-rw-r--r--  .clang-format | 6
-rw-r--r--  .gitignore | 37
-rw-r--r--  .gn (renamed from files/.gn) | 34
-rw-r--r--  .vpython | 52
-rw-r--r--  .vpython3 | 405
-rw-r--r--  AUTHORS (renamed from files/AUTHORS) | 2
-rw-r--r--  Android.bp | 179
-rw-r--r--  BUILD | 14
-rw-r--r--  BUILD.gn (renamed from files/BUILD.gn) | 163
-rw-r--r--  CM_linux_packages.cmake (renamed from files/CM_linux_packages.cmake) | 4
-rw-r--r--  CMakeLists.txt (renamed from files/CMakeLists.txt) | 27
-rw-r--r--  DEPS | 2576
-rw-r--r--  DIR_METADATA | 3
-rw-r--r--  LICENSE | 2
-rw-r--r--  METADATA | 16
-rw-r--r--  OWNERS | 15
-rw-r--r--  OWNERS.android | 1
-rw-r--r--  PATENTS (renamed from files/PATENTS) | 0
-rw-r--r-- [-rwxr-xr-x]  PRESUBMIT.py (renamed from files/PRESUBMIT.py) | 50
-rw-r--r--  README.chromium (renamed from files/README.chromium) | 6
-rw-r--r--  README.md (renamed from files/README.md) | 1
-rw-r--r--  README.version | 3
-rw-r--r--  build_overrides/build.gni (renamed from files/build_overrides/build.gni) | 23
-rw-r--r--  build_overrides/gtest.gni (renamed from files/build_overrides/gtest.gni) | 0
-rw-r--r--  build_overrides/partition_alloc.gni | 17
-rwxr-xr-x  cleanup_links.py (renamed from files/cleanup_links.py) | 27
-rw-r--r--  codereview.settings | 4
-rw-r--r--  docs/deprecated_builds.md (renamed from files/docs/deprecated_builds.md) | 5
-rw-r--r--  docs/environment_variables.md (renamed from files/docs/environment_variables.md) | 9
-rw-r--r--  docs/filtering.md (renamed from files/docs/filtering.md) | 0
-rw-r--r--  docs/formats.md (renamed from files/docs/formats.md) | 55
-rw-r--r--  docs/getting_started.md (renamed from files/docs/getting_started.md) | 54
-rw-r--r--  docs/rotation.md (renamed from files/docs/rotation.md) | 4
-rw-r--r--  download_vs_toolchain.py (renamed from files/download_vs_toolchain.py) | 6
-rw-r--r--  files/.gitignore | 94
-rw-r--r--  files/Android.bp | 179
-rw-r--r--  files/DEPS | 1096
-rw-r--r--  files/LICENSE | 29
-rw-r--r--  files/LICENSE_THIRD_PARTY | 8
-rw-r--r--  files/all.gyp | 21
-rw-r--r--  files/chromium/.gclient | 20
-rw-r--r--  files/chromium/README | 5
-rwxr-xr-x  files/gyp_libyuv | 101
-rw-r--r--  files/gyp_libyuv.py | 28
-rw-r--r--  files/include/libyuv/convert.h | 504
-rw-r--r--  files/include/libyuv/convert_argb.h | 721
-rw-r--r--  files/include/libyuv/rotate.h | 182
-rw-r--r--  files/include/libyuv/scale.h | 179
-rw-r--r--  files/infra/config/PRESUBMIT.py | 15
-rw-r--r--  files/infra/config/README.md | 1
-rw-r--r--  files/infra/config/cq.cfg | 51
-rw-r--r--  files/libyuv_nacl.gyp | 37
-rw-r--r--  files/libyuv_test.gyp | 203
-rw-r--r--  files/public.mk | 13
-rwxr-xr-x  files/setup_links.py | 497
-rw-r--r--  files/source/compare_gcc.cc | 360
-rw-r--r--  files/source/compare_mmi.cc | 123
-rw-r--r--  files/source/compare_neon.cc | 96
-rw-r--r--  files/source/compare_neon64.cc | 90
-rw-r--r--  files/source/convert.cc | 2576
-rw-r--r--  files/source/convert_argb.cc | 2371
-rw-r--r--  files/source/convert_from.cc | 1505
-rw-r--r--  files/source/rotate.cc | 605
-rw-r--r--  files/source/rotate_common.cc | 106
-rw-r--r--  files/source/rotate_dspr2.cc | 475
-rw-r--r--  files/source/rotate_gcc.cc | 374
-rw-r--r--  files/source/rotate_mmi.cc | 291
-rw-r--r--  files/source/row_any.cc | 1429
-rw-r--r--  files/source/row_dspr2.cc | 1721
-rw-r--r--  files/source/row_gcc.cc | 6798
-rw-r--r--  files/source/row_mmi.cc | 6042
-rw-r--r--  files/source/row_neon.cc | 2892
-rw-r--r--  files/source/row_neon64.cc | 3036
-rw-r--r--  files/source/scale_any.cc | 575
-rw-r--r--  files/source/scale_dspr2.cc | 668
-rw-r--r--  files/source/scale_gcc.cc | 1374
-rw-r--r--  files/source/scale_mmi.cc | 1113
-rw-r--r--  files/source/scale_neon.cc | 958
-rw-r--r--  files/source/scale_neon64.cc | 1052
-rwxr-xr-x  files/sync_chromium.py | 154
-rw-r--r--  files/third_party/gflags/BUILD.gn | 73
-rw-r--r--  files/third_party/gflags/LICENSE | 28
-rw-r--r--  files/third_party/gflags/README.libyuv | 28
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags.h | 573
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h | 121
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h | 141
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h | 101
-rw-r--r--  files/third_party/gflags/gen/posix/include/private/config.h | 112
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags.h | 573
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_completions.h | 121
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_declare.h | 141
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h | 101
-rw-r--r--  files/third_party/gflags/gen/win/include/private/config.h | 112
-rw-r--r--  files/third_party/gflags/gflags.gyp | 92
-rw-r--r--  files/tools/OWNERS | 61
-rw-r--r--  files/tools/msan/OWNERS | 3
-rw-r--r--  files/tools/msan/blacklist.txt | 24
-rw-r--r--  files/tools/ubsan/OWNERS | 3
-rw-r--r--  files/tools/ubsan/blacklist.txt | 77
-rw-r--r--  files/tools/ubsan/vptr_blacklist.txt | 128
-rwxr-xr-x  files/tools_libyuv/autoroller/roll_deps.py | 507
-rw-r--r--  files/tools_libyuv/autoroller/unittests/.DS_Store | bin 6148 -> 0 bytes
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.bat | 53
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.py | 869
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.sh | 94
-rw-r--r--  files/tools_libyuv/valgrind/common.py | 256
-rw-r--r--  files/tools_libyuv/valgrind/gdb_helper.py | 91
-rw-r--r--  files/tools_libyuv/valgrind/libyuv_tests.bat | 79
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.py | 139
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.sh | 101
-rwxr-xr-x  files/tools_libyuv/valgrind/locate_valgrind.sh | 73
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/OWNERS | 1
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py | 99
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions.txt | 21
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt | 5
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt | 5
-rwxr-xr-x  files/tools_libyuv/valgrind/memcheck_analyze.py | 644
-rwxr-xr-x  files/tools_libyuv/valgrind/valgrind.sh | 110
-rwxr-xr-x  files/tools_libyuv/valgrind/valgrind_test.py | 517
-rw-r--r--  files/unit_test/convert_test.cc | 3223
-rw-r--r--  files/unit_test/cpu_test.cc | 186
-rw-r--r--  files/unit_test/rotate_test.cc | 394
-rwxr-xr-x  files/util/android/test_runner.py | 37
-rw-r--r--  fuzz/Android.bp (renamed from files/fuzz/Android.bp) | 4
-rw-r--r--  fuzz/OWNERS (renamed from files/fuzz/OWNERS) | 0
-rw-r--r--  fuzz/mjpeg_dec_fuzz.cc (renamed from files/fuzz/mjpeg_dec_fuzz.cc) | 0
-rw-r--r--  include/libyuv.h (renamed from files/include/libyuv.h) | 1
-rw-r--r--  include/libyuv/basic_types.h (renamed from files/include/libyuv/basic_types.h) | 0
-rw-r--r--  include/libyuv/compare.h (renamed from files/include/libyuv/compare.h) | 0
-rw-r--r--  include/libyuv/compare_row.h (renamed from files/include/libyuv/compare_row.h) | 26
-rw-r--r--  include/libyuv/convert.h | 1045
-rw-r--r--  include/libyuv/convert_argb.h | 2315
-rw-r--r--  include/libyuv/convert_from.h (renamed from files/include/libyuv/convert_from.h) | 211
-rw-r--r--  include/libyuv/convert_from_argb.h (renamed from files/include/libyuv/convert_from_argb.h) | 117
-rw-r--r--  include/libyuv/cpu_id.h (renamed from files/include/libyuv/cpu_id.h) | 31
-rw-r--r--  include/libyuv/loongson_intrinsics.h | 1949
-rw-r--r--  include/libyuv/macros_msa.h (renamed from files/include/libyuv/macros_msa.h) | 45
-rw-r--r--  include/libyuv/mjpeg_decoder.h (renamed from files/include/libyuv/mjpeg_decoder.h) | 0
-rw-r--r--  include/libyuv/planar_functions.h (renamed from files/include/libyuv/planar_functions.h) | 404
-rw-r--r--  include/libyuv/rotate.h | 296
-rw-r--r--  include/libyuv/rotate_argb.h (renamed from files/include/libyuv/rotate_argb.h) | 0
-rw-r--r--  include/libyuv/rotate_row.h (renamed from files/include/libyuv/rotate_row.h) | 109
-rw-r--r--  include/libyuv/row.h (renamed from files/include/libyuv/row.h) | 3299
-rw-r--r--  include/libyuv/scale.h | 321
-rw-r--r--  include/libyuv/scale_argb.h (renamed from files/include/libyuv/scale_argb.h) | 0
-rw-r--r--  include/libyuv/scale_rgb.h | 42
-rw-r--r--  include/libyuv/scale_row.h (renamed from files/include/libyuv/scale_row.h) | 963
-rw-r--r--  include/libyuv/scale_uv.h | 51
-rw-r--r--  include/libyuv/version.h (renamed from files/include/libyuv/version.h) | 2
-rw-r--r--  include/libyuv/video_common.h (renamed from files/include/libyuv/video_common.h) | 53
-rw-r--r--  infra/config/OWNERS | 3
-rw-r--r--  infra/config/PRESUBMIT.py | 13
-rw-r--r--  infra/config/README.md | 2
-rw-r--r--  infra/config/codereview.settings (renamed from files/codereview.settings) | 6
-rw-r--r--  infra/config/commit-queue.cfg | 143
-rw-r--r--  infra/config/cr-buildbucket.cfg | 1704
-rw-r--r--  infra/config/luci-logdog.cfg | 9
-rw-r--r--  infra/config/luci-milo.cfg | 246
-rw-r--r--  infra/config/luci-scheduler.cfg | 385
-rwxr-xr-x  infra/config/main.star | 344
-rw-r--r--  infra/config/project.cfg | 15
-rw-r--r--  infra/config/realms.cfg | 83
-rw-r--r--  libyuv.gni (renamed from files/libyuv.gni) | 8
-rw-r--r--  libyuv.gyp (renamed from files/libyuv.gyp) | 0
-rw-r--r--  libyuv.gypi (renamed from files/libyuv.gypi) | 8
-rw-r--r--  linux.mk (renamed from files/linux.mk) | 40
-rw-r--r--  public.mk | 2
-rw-r--r--  pylintrc (renamed from files/pylintrc) | 0
-rwxr-xr-x  riscv_script/prepare_toolchain_qemu.sh | 74
-rw-r--r--  riscv_script/riscv-clang.cmake | 55
-rwxr-xr-x  riscv_script/run_qemu.sh | 15
-rw-r--r--  source/compare.cc (renamed from files/source/compare.cc) | 20
-rw-r--r--  source/compare_common.cc (renamed from files/source/compare_common.cc) | 30
-rw-r--r--  source/compare_gcc.cc | 359
-rw-r--r--  source/compare_msa.cc (renamed from files/source/compare_msa.cc) | 0
-rw-r--r--  source/compare_neon.cc | 96
-rw-r--r--  source/compare_neon64.cc | 94
-rw-r--r--  source/compare_win.cc (renamed from files/source/compare_win.cc) | 14
-rw-r--r--  source/convert.cc | 4055
-rw-r--r--  source/convert_argb.cc | 8556
-rw-r--r--  source/convert_from.cc | 910
-rw-r--r--  source/convert_from_argb.cc (renamed from files/source/convert_from_argb.cc) | 1950
-rw-r--r--  source/convert_jpeg.cc (renamed from files/source/convert_jpeg.cc) | 134
-rw-r--r--  source/convert_to_argb.cc (renamed from files/source/convert_to_argb.cc) | 89
-rw-r--r--  source/convert_to_i420.cc (renamed from files/source/convert_to_i420.cc) | 29
-rw-r--r--  source/cpu_id.cc (renamed from files/source/cpu_id.cc) | 181
-rw-r--r--  source/mjpeg_decoder.cc (renamed from files/source/mjpeg_decoder.cc) | 7
-rw-r--r--  source/mjpeg_validate.cc (renamed from files/source/mjpeg_validate.cc) | 0
-rw-r--r--  source/planar_functions.cc (renamed from files/source/planar_functions.cc) | 2934
-rw-r--r--  source/rotate.cc | 1231
-rw-r--r--  source/rotate_any.cc (renamed from files/source/rotate_any.cc) | 12
-rw-r--r--  source/rotate_argb.cc (renamed from files/source/rotate_argb.cc) | 122
-rw-r--r--  source/rotate_common.cc | 198
-rw-r--r--  source/rotate_gcc.cc | 503
-rw-r--r--  source/rotate_lsx.cc | 243
-rw-r--r--  source/rotate_msa.cc (renamed from files/source/rotate_msa.cc) | 0
-rw-r--r--  source/rotate_neon.cc (renamed from files/source/rotate_neon.cc) | 258
-rw-r--r--  source/rotate_neon64.cc (renamed from files/source/rotate_neon64.cc) | 352
-rw-r--r--  source/rotate_win.cc (renamed from files/source/rotate_win.cc) | 5
-rw-r--r--  source/row_any.cc | 2459
-rw-r--r--  source/row_common.cc (renamed from files/source/row_common.cc) | 2832
-rw-r--r--  source/row_gcc.cc | 9744
-rw-r--r--  source/row_lasx.cc | 2304
-rw-r--r--  source/row_lsx.cc | 2987
-rw-r--r--  source/row_msa.cc (renamed from files/source/row_msa.cc) | 1507
-rw-r--r--  source/row_neon.cc | 3999
-rw-r--r--  source/row_neon64.cc | 4630
-rw-r--r--  source/row_rvv.cc | 1394
-rw-r--r--  source/row_win.cc (renamed from files/source/row_win.cc) | 750
-rw-r--r--  source/scale.cc (renamed from files/source/scale.cc) | 1358
-rw-r--r--  source/scale_any.cc | 1078
-rw-r--r--  source/scale_argb.cc (renamed from files/source/scale_argb.cc) | 488
-rw-r--r--  source/scale_common.cc (renamed from files/source/scale_common.cc) | 753
-rw-r--r--  source/scale_gcc.cc | 2953
-rw-r--r--  source/scale_lsx.cc | 739
-rw-r--r--  source/scale_msa.cc (renamed from files/source/scale_msa.cc) | 0
-rw-r--r--  source/scale_neon.cc | 1533
-rw-r--r--  source/scale_neon64.cc | 1578
-rw-r--r--  source/scale_rgb.cc | 66
-rw-r--r--  source/scale_rvv.cc | 1040
-rw-r--r--  source/scale_uv.cc | 1210
-rw-r--r--  source/scale_win.cc (renamed from files/source/scale_win.cc) | 5
-rwxr-xr-x  source/test.sh | 35
-rw-r--r--  source/video_common.cc (renamed from files/source/video_common.cc) | 0
-rw-r--r--  tools_libyuv/OWNERS | 4
-rwxr-xr-x  tools_libyuv/autoroller/roll_deps.py | 822
-rwxr-xr-x  tools_libyuv/autoroller/unittests/roll_deps_test.py (renamed from files/tools_libyuv/autoroller/unittests/roll_deps_test.py) | 48
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS) | 1
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new) | 0
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old) | 0
-rwxr-xr-x  tools_libyuv/get_landmines.py (renamed from files/tools_libyuv/get_landmines.py) | 7
-rw-r--r--  tools_libyuv/msan/OWNERS | 3
-rw-r--r--  tools_libyuv/msan/blacklist.txt (renamed from files/tools_libyuv/msan/blacklist.txt) | 0
-rw-r--r--  tools_libyuv/ubsan/OWNERS | 3
-rw-r--r--  tools_libyuv/ubsan/blacklist.txt (renamed from files/tools_libyuv/ubsan/blacklist.txt) | 0
-rw-r--r--  tools_libyuv/ubsan/vptr_blacklist.txt (renamed from files/tools_libyuv/ubsan/vptr_blacklist.txt) | 0
-rw-r--r--  unit_test/basictypes_test.cc (renamed from files/unit_test/basictypes_test.cc) | 0
-rw-r--r--  unit_test/color_test.cc (renamed from files/unit_test/color_test.cc) | 305
-rw-r--r--  unit_test/compare_test.cc (renamed from files/unit_test/compare_test.cc) | 9
-rw-r--r--  unit_test/convert_argb_test.cc | 2700
-rw-r--r--  unit_test/convert_test.cc | 2110
-rw-r--r--  unit_test/cpu_test.cc | 342
-rw-r--r--  unit_test/cpu_thread_test.cc (renamed from files/unit_test/cpu_thread_test.cc) | 4
-rw-r--r--  unit_test/math_test.cc (renamed from files/unit_test/math_test.cc) | 5
-rw-r--r--  unit_test/planar_test.cc (renamed from files/unit_test/planar_test.cc) | 1769
-rw-r--r--  unit_test/rotate_argb_test.cc (renamed from files/unit_test/rotate_argb_test.cc) | 164
-rw-r--r--  unit_test/rotate_test.cc | 962
-rw-r--r--  unit_test/scale_argb_test.cc (renamed from files/unit_test/scale_argb_test.cc) | 167
-rw-r--r--  unit_test/scale_plane_test.cc | 470
-rw-r--r--  unit_test/scale_rgb_test.cc | 280
-rw-r--r--  unit_test/scale_test.cc (renamed from files/unit_test/scale_test.cc) | 873
-rw-r--r--  unit_test/scale_uv_test.cc | 249
-rw-r--r--  unit_test/testdata/arm_v7.txt (renamed from files/unit_test/testdata/arm_v7.txt) | 0
-rw-r--r--  unit_test/testdata/juno.txt (renamed from files/unit_test/testdata/juno.txt) | 0
-rw-r--r--  unit_test/testdata/mips.txt | 7
-rw-r--r--  unit_test/testdata/mips_loongson2k.txt | 5
-rw-r--r--  unit_test/testdata/mips_loongson3.txt | 10
-rw-r--r--  unit_test/testdata/mips_loongson_mmi.txt | 7
-rw-r--r--  unit_test/testdata/mips_msa.txt | 7
-rw-r--r--  unit_test/testdata/riscv64.txt | 4
-rw-r--r--  unit_test/testdata/riscv64_rvv.txt | 4
-rw-r--r--  unit_test/testdata/riscv64_rvv_zvfh.txt | 4
-rw-r--r--  unit_test/testdata/tegra3.txt (renamed from files/unit_test/testdata/tegra3.txt) | 0
-rw-r--r--  unit_test/testdata/test0.jpg (renamed from files/unit_test/testdata/test0.jpg) | bin 421 -> 421 bytes
-rw-r--r--  unit_test/testdata/test1.jpg (renamed from files/unit_test/testdata/test1.jpg) | bin 735 -> 735 bytes
-rw-r--r--  unit_test/testdata/test2.jpg (renamed from files/unit_test/testdata/test2.jpg) | bin 685 -> 685 bytes
-rw-r--r--  unit_test/testdata/test3.jpg (renamed from files/unit_test/testdata/test3.jpg) | bin 704 -> 704 bytes
-rw-r--r--  unit_test/testdata/test4.jpg (renamed from files/unit_test/testdata/test4.jpg) | bin 701 -> 701 bytes
-rw-r--r--  unit_test/unit_test.cc (renamed from files/unit_test/unit_test.cc) | 210
-rw-r--r--  unit_test/unit_test.h (renamed from files/unit_test/unit_test.h) | 22
-rw-r--r--  unit_test/video_common_test.cc (renamed from files/unit_test/video_common_test.cc) | 11
-rw-r--r--  util/Makefile (renamed from files/util/Makefile) | 0
-rw-r--r--  util/color.cc | 120
-rw-r--r--  util/compare.cc (renamed from files/util/compare.cc) | 0
-rw-r--r--  util/cpuid.c (renamed from files/util/cpuid.c) | 81
-rw-r--r--  util/i444tonv12_eg.cc | 28
-rw-r--r--  util/psnr.cc (renamed from files/util/psnr.cc) | 0
-rw-r--r--  util/psnr.h (renamed from files/util/psnr.h) | 0
-rw-r--r--  util/psnr_main.cc (renamed from files/util/psnr_main.cc) | 4
-rw-r--r--  util/ssim.cc (renamed from files/util/ssim.cc) | 0
-rw-r--r--  util/ssim.h (renamed from files/util/ssim.h) | 0
-rw-r--r--  util/yuvconstants.c | 106
-rw-r--r--  util/yuvconvert.cc (renamed from files/util/yuvconvert.cc) | 30
-rw-r--r--  winarm.mk (renamed from files/winarm.mk) | 1
284 files changed, 92486 insertions, 56420 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..59d48705
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+# Defines the Chromium style for automatic reformatting.
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: Chromium
+---
+Language: Java
+BasedOnStyle: Google
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..20d679b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+*.pyc
+.landmines
+pin-log.txt
+/base
+/build
+/buildtools
+/google_apis
+/links
+/links.db
+/ios
+/mojo
+/native_client
+/net
+/out
+/unit_test/out
+/source/out
+/sde-avx-sse-transition-out.txt
+/testing
+/third_party
+/tools
+
+# Files generated by CMake build
+cmake_install.cmake
+CMakeCache.txt
+CMakeFiles/
+yuvconvert
+libgtest.a
+libyuv.a
+libyuv_unittest
+
+# Files generated by winarm.mk build
+libyuv_arm.lib
+source/*.o
+
+# Files generated by perf
+perf.data
+perf.data.old
diff --git a/files/.gn b/.gn
index 63dad32d..f9a5ee6c 100644
--- a/files/.gn
+++ b/.gn
@@ -6,9 +6,15 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
+import("//build/dotfile_settings.gni")
+
# The location of the build configuration file.
buildconfig = "//build/config/BUILDCONFIG.gn"
+# The python interpreter to use by default. On Windows, this will look
+# for python3.exe and python3.bat.
+script_executable = "python3"
+
# The secondary source root is a parallel directory tree where
# GN build files are placed when they can not be placed directly
# in the source tree, e.g. for third party source trees.
@@ -23,24 +29,10 @@ check_targets = [ "//libyuv/*" ]
# These are the list of GN files that run exec_script. This whitelist exists
# to force additional review for new uses of exec_script, which is strongly
# discouraged except for gypi_to_gn calls.
-exec_script_whitelist = [
- "//build/config/BUILD.gn",
- "//build/config/android/BUILD.gn",
- "//build/config/android/config.gni",
- "//build/config/android/internal_rules.gni",
- "//build/config/android/rules.gni",
- "//build/config/compiler/BUILD.gn",
- "//build/config/gcc/gcc_version.gni",
- "//build/config/ios/ios_sdk.gni",
- "//build/config/linux/BUILD.gn",
- "//build/config/linux/pkg_config.gni",
- "//build/config/mac/mac_sdk.gni",
- "//build/config/posix/BUILD.gn",
- "//build/config/sysroot.gni",
- "//build/config/win/visual_studio_version.gni",
- "//build/gn_helpers.py",
- "//build/gypi_to_gn.py",
- "//build/toolchain/gcc_toolchain.gni",
- "//build/toolchain/mac/BUILD.gn",
- "//build/toolchain/win/BUILD.gn",
-]
+exec_script_whitelist = build_dotfile_settings.exec_script_whitelist +
+ [ "//build_overrides/build.gni" ]
+
+default_args = {
+ mac_sdk_min = "10.12"
+ ios_deployment_target = "12.0"
+}
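
The `script_executable` setting above names a bare `python3` rather than a full path, and the comment describes a PATH-style lookup. A minimal sketch of that resolution in Python, assuming plain PATH search is a fair approximation (GN's actual lookup is implemented in its own source; this is only illustrative):

```python
# Illustrative only: approximate how a bare "python3" script_executable
# might be resolved. On Windows, shutil.which() also consults PATHEXT,
# which is how both python3.exe and python3.bat become candidates.
import shutil
from typing import Optional

def resolve_script_executable(name: str = "python3") -> Optional[str]:
    # Returns the first matching executable on PATH, or None.
    return shutil.which(name)

if __name__ == "__main__":
    print(resolve_script_executable())
```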
diff --git a/.vpython b/.vpython
new file mode 100644
index 00000000..4a64fd21
--- /dev/null
+++ b/.vpython
@@ -0,0 +1,52 @@
+# This is a vpython "spec" file.
+#
+# It describes patterns for python wheel dependencies of the python scripts in
+# the chromium repo, particularly for dependencies that have compiled components
+# (since pure-python dependencies can be easily vendored into third_party).
+#
+# When vpython is invoked, it finds this file and builds a python VirtualEnv,
+# containing all of the dependencies described in this file, fetching them from
+# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`,
+# this never requires the end-user machine to have a working python extension
+# compilation environment. All of these packages are built using:
+# https://chromium.googlesource.com/infra/infra/+/master/infra/tools/dockerbuild/
+#
+# All python scripts in the repo share this same spec, to avoid dependency
+# fragmentation.
+#
+# If you have depot_tools installed in your $PATH, you can invoke python scripts
+# in this repo by running them as you normally would run them, except
+# substituting `vpython` instead of `python` on the command line, e.g.:
+# vpython path/to/script.py some --arguments
+#
+# Read more about `vpython` and how to modify this file here:
+# https://chromium.googlesource.com/infra/infra/+/master/doc/users/vpython.md
+
+python_version: "2.7"
+
+# Used by:
+# third_party/catapult
+wheel: <
+ name: "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}"
+ version: "version:5.2.2"
+>
+
+# Used by:
+# third_party/catapult
+wheel: <
+ name: "infra/python/wheels/pypiwin32/${vpython_platform}"
+ version: "version:219"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+>
+
+# Used by:
+# build/android
+wheel: <
+ name: "infra/python/wheels/requests-py2_py3"
+ version: "version:2.13.0"
+>
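
The wheel names in this spec are CIPD package templates; placeholders such as `${platform}`, `${py_python}`, and `${py_abi}` are filled in per host by vpython. A minimal sketch of that substitution with hypothetical host values (the real values are derived from the running interpreter and OS):

```python
from string import Template

# Hypothetical host properties; vpython computes the real ones itself.
props = {
    "platform": "linux-amd64",
    "py_python": "cp27",
    "py_abi": "cp27mu",
    "vpython_platform": "linux-amd64_cp27_cp27mu",
}

# A wheel name taken from the spec above; string.Template uses the same
# ${...} placeholder syntax as the spec.
name = "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}"
print(Template(name).substitute(props))
# -> infra/python/wheels/psutil/linux-amd64_cp27_cp27mu
```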
diff --git a/.vpython3 b/.vpython3
new file mode 100644
index 00000000..28d819e7
--- /dev/null
+++ b/.vpython3
@@ -0,0 +1,405 @@
+# This is a vpython "spec" file.
+#
+# It describes patterns for python wheel dependencies of the python scripts in
+# the chromium repo, particularly for dependencies that have compiled components
+# (since pure-python dependencies can be easily vendored into third_party).
+#
+# When vpython is invoked, it finds this file and builds a python VirtualEnv,
+# containing all of the dependencies described in this file, fetching them from
+# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`,
+# this never requires the end-user machine to have a working python extension
+# compilation environment. All of these packages are built using:
+# https://chromium.googlesource.com/infra/infra/+/main/infra/tools/dockerbuild/
+#
+# All python scripts in the repo share this same spec, to avoid dependency
+# fragmentation.
+#
+# If you have depot_tools installed in your $PATH, you can invoke python scripts
+# in this repo by running them as you normally would run them, except
+# substituting `vpython` instead of `python` on the command line, e.g.:
+# vpython path/to/script.py some --arguments
+#
+# Read more about `vpython` and how to modify this file here:
+# https://chromium.googlesource.com/infra/infra/+/main/doc/users/vpython.md
+
+python_version: "3.8"
+
+# The default set of platforms vpython checks does not yet include mac-arm64.
+# Setting `verify_pep425_tag` to the list of platforms we explicitly must support
+# allows us to ensure that vpython specs stay mac-arm64-friendly
+verify_pep425_tag: [
+ {python: "cp38", abi: "cp38", platform: "manylinux1_x86_64"},
+ {python: "cp38", abi: "cp38", platform: "linux_arm64"},
+
+ {python: "cp38", abi: "cp38", platform: "macosx_10_10_intel"},
+ {python: "cp38", abi: "cp38", platform: "macosx_11_0_arm64"},
+
+ {python: "cp38", abi: "cp38", platform: "win32"},
+ {python: "cp38", abi: "cp38", platform: "win_amd64"}
+]
+
+# Used by:
+# build/android/pylib/local/emulator/avd.py
+# components/policy/test_support/policy_testserver.py
+wheel: <
+ name: "infra/python/wheels/protobuf-py2_py3"
+ version: "version:3.15.8"
+>
+
+# TODO(https://crbug.com/898348): Add in necessary wheels as Python3 versions
+# become available.
+wheel: <
+ name: "infra/python/wheels/six-py2_py3"
+ version: "version:1.15.0"
+>
+
+# Common utilities.
+# Use the same versions specified by //third_party/catapult/.vpython3 so that
+# Chromium tests using Telemetry function properly.
+wheel: <
+ name: "infra/python/wheels/numpy/${vpython_platform}"
+ version: "version:1.20.3"
+ # A newer version of numpy is required on ARM64, but it breaks older OS versions.
+ not_match_tag <
+ platform: "macosx_11_0_arm64"
+ >
+>
+wheel: <
+ name: "infra/python/wheels/numpy/mac-arm64_cp38_cp38"
+ version: "version:1.21.1"
+ match_tag <
+ platform: "macosx_11_0_arm64"
+ >
+>
+wheel: <
+ name: "infra/python/wheels/psutil/${vpython_platform}"
+ version: "version:5.8.0.chromium.2"
+>
+wheel: <
+ name: "infra/python/wheels/requests-py3"
+ version: "version:2.31.0"
+>
+
+# Used by various python unit tests.
+wheel: <
+ name: "infra/python/wheels/mock-py2_py3"
+ version: "version:2.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/parameterized-py2_py3"
+ version: "version:0.7.1"
+>
+wheel: <
+ name: "infra/python/wheels/pbr-py2_py3"
+ version: "version:3.0.0"
+>
+
+wheel: <
+ name: "infra/python/wheels/pyfakefs-py2_py3"
+ version: "version:3.7.2"
+>
+
+# Used by:
+# build/chromeos/test_runner.py
+wheel: <
+ name: "infra/python/wheels/jsonlines-py2_py3"
+ version: "version:1.2.0"
+>
+wheel: <
+ name: "infra/python/wheels/python-dateutil-py2_py3"
+ version: "version:2.7.3"
+>
+
+# Used by WPT importer
+wheel: <
+ name: "infra/python/wheels/charset_normalizer-py3"
+ version: "version:2.0.4"
+>
+wheel: <
+ name: "infra/python/wheels/pyasn1-py2_py3"
+ version: "version:0.4.5"
+>
+wheel: <
+ name: "infra/python/wheels/pyasn1_modules-py2_py3"
+ version: "version:0.2.4"
+>
+wheel: <
+ name: "infra/python/wheels/rsa-py2_py3"
+ version: "version:3.4.2"
+>
+wheel: <
+ name: "infra/python/wheels/cachetools-py2_py3"
+ version: "version:2.0.1"
+>
+wheel: <
+ name: "infra/python/wheels/uritemplate-py2_py3"
+ version: "version:3.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-auth-py2_py3"
+ version: "version:1.25.0"
+>
+wheel: <
+ name: "infra/python/wheels/googleapis-common-protos-py2_py3"
+ version: "version:1.52.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-api-core-py2_py3"
+ version: "version:1.25.1"
+>
+wheel: <
+ name: "infra/python/wheels/google-auth-httplib2-py2_py3"
+ version: "version:0.1.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-api-python-client-py3"
+ version: "version:2.2.0"
+>
+wheel: <
+ name: "infra/python/wheels/oauth2client-py2_py3"
+ version: "version:3.0.0"
+>
+
+# Used by Web Platform Tests (WPT) codebase in
+# //third_party/blink/web_tests/external/wpt/tools/
+wheel: <
+ name: "infra/python/wheels/html5lib-py2_py3"
+ version: "version:1.0.1"
+>
+wheel: <
+ name: "infra/python/wheels/mozdebug-py2_py3"
+ version: "version:0.2"
+>
+wheel: <
+ name: "infra/python/wheels/mozinfo-py2_py3"
+ version: "version:1.2.2"
+>
+wheel: <
+ name: "infra/python/wheels/mozlog-py2_py3"
+ version: "version:7.1.0"
+>
+wheel: <
+ name: "infra/python/wheels/mozprocess-py2_py3"
+ version: "version:1.2.1"
+>
+wheel: <
+ name: "infra/python/wheels/urllib3-py2_py3"
+ version: "version:1.24.3"
+>
+wheel: <
+ name: "infra/python/wheels/blessings-py2_py3"
+ version: "version:1.7"
+>
+wheel: <
+ name: "infra/python/wheels/mozfile-py2_py3"
+ version: "version:2.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/mozterm-py2_py3"
+ version: "version:1.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/webencodings-py2_py3"
+ version: "version:0.5.1"
+>
+wheel: <
+ name: "infra/python/wheels/certifi-py2_py3"
+ version: "version:2020.11.8"
+>
+wheel: <
+ name: "infra/python/wheels/chardet-py2_py3"
+ version: "version:3.0.4"
+>
+wheel: <
+ name: "infra/python/wheels/idna-py2_py3"
+ version: "version:2.8"
+>
+wheel: <
+ name: "infra/python/wheels/distro-py2_py3"
+ version: "version:1.4.0"
+>
+wheel: <
+ name: "infra/python/wheels/pillow/linux-amd64_cp38_cp38"
+ version: "version:8.1.2"
+>
+wheel: <
+ name: "infra/python/wheels/aioquic/${vpython_platform}"
+ version: "version:0.9.15"
+>
+wheel: <
+ name: "infra/python/wheels/pylsqpack/${vpython_platform}"
+ version: "version:0.3.12"
+>
+wheel: <
+ name: "infra/python/wheels/cryptography/${vpython_platform}"
+ version: "version:3.3.1.chromium.1"
+>
+wheel: <
+ name: "infra/python/wheels/cffi/${vpython_platform}"
+ version: "version:1.14.5"
+>
+wheel: <
+ name: "infra/python/wheels/pycparser-py2_py3"
+ version: "version:2.19"
+>
+
+# Used by:
+# chrome/test/chromedriver/test/run_webdriver_tests.py
+wheel: <
+ name: "infra/python/wheels/iniconfig-py3"
+ version: "version:1.1.1"
+>
+
+wheel: <
+ name: "infra/python/wheels/packaging-py2_py3"
+ version: "version:16.8"
+>
+
+wheel: <
+ name: "infra/python/wheels/pyparsing-py2_py3"
+ version: "version:2.4.7"
+>
+
+wheel: <
+ name: "infra/python/wheels/toml-py3"
+ version: "version:0.10.1"
+>
+
+wheel <
+ name: "infra/python/wheels/pytest-py3"
+ version: "version:6.2.2"
+>
+
+wheel <
+ name: "infra/python/wheels/pytest-asyncio-py3"
+ version: "version:0.14.0"
+>
+
+wheel <
+ name: "infra/python/wheels/attrs-py2_py3"
+ version: "version:20.3.0"
+>
+
+wheel <
+ name: "infra/python/wheels/six-py2_py3"
+ version: "version:1.15.0"
+>
+
+wheel <
+ name: "infra/python/wheels/more-itertools-py2_py3"
+ version: "version:4.1.0"
+>
+
+wheel <
+ name: "infra/python/wheels/pluggy-py3"
+ version: "version:0.13.1"
+>
+
+wheel <
+ name: "infra/python/wheels/py-py2_py3"
+ version: "version:1.10.0"
+>
+
+wheel <
+ name: "infra/python/wheels/funcsigs-py2_py3"
+ version: "version:1.0.2"
+>
+
+wheel: <
+ name: "infra/python/wheels/atomicwrites-py2_py3"
+ version: "version:1.3.0"
+>
+
+wheel: <
+ name: "infra/python/wheels/colorama-py2_py3"
+ version: "version:0.4.1"
+>
+
+# Used by:
+# testing/buildbot/generate_buildbot_json_coveragetest.py
+wheel: <
+ name: "infra/python/wheels/coverage/${vpython_platform}"
+ version: "version:5.5.chromium.2"
+>
+
+# Used by:
+# //content/test/gpu
+wheel: <
+ name: "infra/python/wheels/pathos/${vpython_platform}"
+ version: "version:0.2.7.chromium.4"
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "manylinux1_i686"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_mips64"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_armv6l"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_armv7l"
+ >
+>
+
+# Used by:
+# //tools/infra/find_bad_builds.py
+wheel: <
+ name: "infra/python/wheels/pytz-py2_py3"
+ version: "version:2018.4"
+>
+
+# Used by:
+# //third_party/blink/tools/blinkpy/web_tests/port/server_process.py
+wheel: <
+ name: "infra/python/wheels/pywin32/${vpython_platform}"
+ version: "version:300"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+>
+
+# Used by:
+# //content/test/gpu/gpu_tests/color_profile_manager_mac.py
+wheel: <
+ name: "infra/python/wheels/pyobjc/${vpython_platform}"
+ version: "version:7.3.chromium.1"
+ match_tag: <
+ platform: "macosx_10_10_intel"
+ >
+>
+
+# Used by:
+# tools/perf/core/results_dashboard.py
+wheel: <
+ name: "infra/python/wheels/httplib2-py3"
+ version: "version:0.19.1"
+>
+
+# Used by:
+# tools/perf/flakiness_cli
+wheel: <
+ name: "infra/python/wheels/pandas/${vpython_platform}"
+ version: "version:1.3.2.chromium.1"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+ match_tag: <
+ platform: "manylinux1_i686"
+ >
+ match_tag: <
+ platform: "manylinux1_x86_64"
+ >
+ match_tag: <
+ platform: "macosx_10_6_intel"
+ >
+>
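
`verify_pep425_tag` pins the (python, abi, platform) triples that every wheel in the spec must cover. A minimal sketch of that coverage check over plain tuples, using the triples listed above (vpython's real verification resolves wheels against CIPD, which this does not attempt):

```python
# The triples from verify_pep425_tag above, as plain tuples.
REQUIRED_TAGS = {
    ("cp38", "cp38", "manylinux1_x86_64"),
    ("cp38", "cp38", "linux_arm64"),
    ("cp38", "cp38", "macosx_10_10_intel"),
    ("cp38", "cp38", "macosx_11_0_arm64"),
    ("cp38", "cp38", "win32"),
    ("cp38", "cp38", "win_amd64"),
}

def missing_tags(available):
    """Return the required tags with no matching wheel tag."""
    return REQUIRED_TAGS - set(available)

# Hypothetical wheel availability that forgot the mac-arm64 build:
print(sorted(missing_tags([
    ("cp38", "cp38", "manylinux1_x86_64"),
    ("cp38", "cp38", "win_amd64"),
])))
```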
diff --git a/files/AUTHORS b/AUTHORS
index 9686ac13..28c08956 100644
--- a/files/AUTHORS
+++ b/AUTHORS
@@ -2,3 +2,5 @@
# Name or Organization <email address>
Google Inc.
+
+Ivan Pavlotskiy <ivan.pavlotskiy@lgepartner.com>
diff --git a/Android.bp b/Android.bp
index e4ed511c..506184e0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,7 +1,6 @@
package {
default_applicable_licenses: ["external_libyuv_license"],
}
-
// Added automatically by a large-scale-change
// See: http://go/android-license-faq
license {
@@ -12,7 +11,183 @@ license {
],
license_text: [
"LICENSE",
+ "PATENTS",
],
}
-
subdirs = ["files"]
+
+cc_library {
+ name: "libyuv",
+ vendor_available: true,
+ product_available: true,
+ host_supported: true,
+
+ srcs: [
+ "source/compare.cc",
+ "source/compare_common.cc",
+ "source/compare_gcc.cc",
+ "source/compare_msa.cc",
+ "source/compare_neon.cc",
+ "source/compare_neon64.cc",
+ "source/convert.cc",
+ "source/convert_argb.cc",
+ "source/convert_from.cc",
+ "source/convert_from_argb.cc",
+ "source/convert_jpeg.cc",
+ "source/convert_to_argb.cc",
+ "source/convert_to_i420.cc",
+ "source/cpu_id.cc",
+ "source/mjpeg_decoder.cc",
+ "source/mjpeg_validate.cc",
+ "source/planar_functions.cc",
+ "source/rotate.cc",
+ "source/rotate_any.cc",
+ "source/rotate_argb.cc",
+ "source/rotate_common.cc",
+ "source/rotate_gcc.cc",
+ "source/rotate_msa.cc",
+ "source/rotate_neon.cc",
+ "source/rotate_neon64.cc",
+ "source/row_any.cc",
+ "source/row_common.cc",
+ "source/row_gcc.cc",
+ "source/row_msa.cc",
+ "source/row_neon.cc",
+ "source/row_neon64.cc",
+ "source/row_rvv.cc",
+ "source/scale.cc",
+ "source/scale_any.cc",
+ "source/scale_argb.cc",
+ "source/scale_common.cc",
+ "source/scale_gcc.cc",
+ "source/scale_msa.cc",
+ "source/scale_neon.cc",
+ "source/scale_neon64.cc",
+ "source/scale_rgb.cc",
+ "source/scale_rvv.cc",
+ "source/scale_uv.cc",
+ "source/video_common.cc",
+ ],
+
+ cflags: [
+ "-Wall",
+ "-Werror",
+ "-Wno-unused-parameter",
+ "-fexceptions",
+ "-DHAVE_JPEG",
+ "-DLIBYUV_UNLIMITED_DATA",
+ ],
+
+ arch: {
+ arm: {
+ cflags: ["-mfpu=neon"],
+ },
+ },
+
+ shared_libs: ["libjpeg"],
+
+ export_include_dirs: ["include"],
+
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.media.swcodec",
+ "com.android.virt",
+ ],
+ min_sdk_version: "29",
+}
+
+// compatibility static library until all uses of libyuv_static are replaced
+// with libyuv (b/37646797)
+cc_library_static {
+ name: "libyuv_static",
+ vendor_available: true,
+ whole_static_libs: ["libyuv"],
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.media.swcodec",
+ ],
+ min_sdk_version: "29",
+}
+
+cc_test {
+ name: "libyuv_unittest",
+ static_libs: ["libyuv"],
+ shared_libs: ["libjpeg"],
+ cflags: ["-Wall", "-Werror"],
+ srcs: [
+ "unit_test/basictypes_test.cc",
+ "unit_test/color_test.cc",
+ "unit_test/compare_test.cc",
+ "unit_test/convert_test.cc",
+ "unit_test/cpu_test.cc",
+ "unit_test/cpu_thread_test.cc",
+ "unit_test/math_test.cc",
+ "unit_test/planar_test.cc",
+ "unit_test/rotate_argb_test.cc",
+ "unit_test/rotate_test.cc",
+ "unit_test/scale_argb_test.cc",
+ "unit_test/scale_plane_test.cc",
+ "unit_test/scale_rgb_test.cc",
+ "unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
+ "unit_test/unit_test.cc",
+ "unit_test/video_common_test.cc",
+ ],
+}
+
+cc_test {
+ name: "compare",
+ gtest: false,
+ srcs: [
+ "util/compare.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "cpuid",
+ gtest: false,
+ srcs: [
+ "util/cpuid.c",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "i444tonv12_eg",
+ gtest: false,
+ srcs: [
+ "util/i444tonv12_eg.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "psnr",
+ gtest: false,
+ srcs: [
+ "util/psnr_main.cc",
+ "util/psnr.cc",
+ "util/ssim.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "yuvconstants",
+ gtest: false,
+ srcs: [
+ "util/yuvconstants.c",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "yuvconvert",
+ gtest: false,
+ srcs: [
+ "util/yuvconvert.cc",
+ ],
+ static_libs: ["libyuv"],
+ shared_libs: ["libjpeg"],
+}
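
The `gtest: false` cc_test entries above build standalone command-line utilities (compare, cpuid, i444tonv12_eg, psnr, yuvconstants, yuvconvert) rather than gtest suites. A minimal sketch of driving one of them, the `cpuid` tool built from `util/cpuid.c`, from a script; the binary path is an assumption, and the output format is simply whatever the tool prints:

```python
import subprocess

def run_cpuid(binary: str = "./cpuid") -> str:
    # Hypothetical path to the built cpuid binary; adjust for your out dir.
    result = subprocess.run([binary], capture_output=True, text=True,
                            check=True)
    return result.stdout

if __name__ == "__main__":
    print(run_cpuid())
```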
diff --git a/BUILD b/BUILD
deleted file mode 100644
index 3145e36a..00000000
--- a/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright 2011 Google Inc. All Rights Reserved.
-#
-# Description:
-# The libyuv package provides implementation yuv image conversion and
-# scaling.
-#
-# This library is used by Talk Video and WebRTC.
-#
-
-licenses(['notice']) # 3-clause BSD
-
-exports_files(['LICENSE'])
-
-package(default_visibility = ['//visibility:public'])
diff --git a/files/BUILD.gn b/BUILD.gn
index 8904fd6c..2c600b22 100644
--- a/files/BUILD.gn
+++ b/BUILD.gn
@@ -6,12 +6,13 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("libyuv.gni")
+import("//build/config/features.gni")
import("//testing/test.gni")
+import("libyuv.gni")
declare_args() {
- # Set to false to disable building with gflags.
- libyuv_use_gflags = true
+ # Set to false to disable building with absl flags.
+ libyuv_use_absl_flags = true
# When building a shared library using a target in WebRTC or
# Chromium projects that depends on libyuv, setting this flag
@@ -21,26 +22,40 @@ declare_args() {
config("libyuv_config") {
include_dirs = [ "include" ]
- if (is_android && current_cpu == "arm64") {
- ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ if (is_android) {
+ if (target_cpu == "arm" || target_cpu == "x86" || target_cpu == "mipsel") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ } else {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ }
+ }
+ defines = []
+ if (!libyuv_use_neon) {
+ defines += [ "LIBYUV_DISABLE_NEON" ]
+ }
+ if (libyuv_disable_rvv) {
+ defines += [ "LIBYUV_DISABLE_RVV" ]
+ }
+ if (!libyuv_use_lsx) {
+ defines += [ "LIBYUV_DISABLE_LSX" ]
}
- if (is_android && current_cpu != "arm64") {
- ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ if (!libyuv_use_lasx) {
+ defines += [ "LIBYUV_DISABLE_LASX" ]
}
}
# This target is built when no specific target is specified on the command line.
group("default") {
testonly = true
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (libyuv_include_tests) {
deps += [
":compare",
":cpuid",
+ ":i444tonv12_eg",
":libyuv_unittest",
":psnr",
+ ":yuvconstants",
":yuvconvert",
]
}
@@ -52,13 +67,9 @@ group("libyuv") {
if (is_win && target_cpu == "x64") {
# Compile with clang in order to get inline assembly
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
- ]
+ public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ]
} else {
- public_deps = [
- ":libyuv_internal",
- ]
+ public_deps = [ ":libyuv_internal" ]
}
if (libyuv_use_neon) {
@@ -69,11 +80,15 @@ group("libyuv") {
deps += [ ":libyuv_msa" ]
}
- if (libyuv_use_mmi) {
- deps += [ ":libyuv_mmi" ]
+ if (libyuv_use_lsx) {
+ deps += [ ":libyuv_lsx" ]
}
- if (!is_ios) {
+ if (libyuv_use_lasx) {
+ deps += [ ":libyuv_lasx" ]
+ }
+
+ if (!is_ios && !libyuv_disable_jpeg) {
# Make sure that clients of libyuv link with libjpeg. This can't go in
# libyuv_internal because in Windows x64 builds that will generate a clang
# build of libjpeg, and we don't want two copies.
@@ -102,7 +117,9 @@ static_library("libyuv_internal") {
"include/libyuv/row.h",
"include/libyuv/scale.h",
"include/libyuv/scale_argb.h",
+ "include/libyuv/scale_rgb.h",
"include/libyuv/scale_row.h",
+ "include/libyuv/scale_uv.h",
"include/libyuv/version.h",
"include/libyuv/video_common.h",
@@ -131,12 +148,16 @@ static_library("libyuv_internal") {
"source/row_any.cc",
"source/row_common.cc",
"source/row_gcc.cc",
+ "source/row_rvv.cc",
"source/row_win.cc",
"source/scale.cc",
"source/scale_any.cc",
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_rgb.cc",
+ "source/scale_rvv.cc",
+ "source/scale_uv.cc",
"source/scale_win.cc",
"source/video_common.cc",
]
@@ -150,7 +171,7 @@ static_library("libyuv_internal") {
configs += [ "//build/config/gcc:symbol_visibility_default" ]
}
- if (!is_ios) {
+ if ((!is_ios || use_blink) && !libyuv_disable_jpeg) {
defines += [ "HAVE_JPEG" ]
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
@@ -192,9 +213,7 @@ if (libyuv_use_neon) {
"source/scale_neon64.cc",
]
- deps = [
- ":libyuv_internal",
- ]
+ deps = [ ":libyuv_internal" ]
public_configs = [ ":libyuv_config" ]
@@ -225,28 +244,46 @@ if (libyuv_use_msa) {
"source/scale_msa.cc",
]
- deps = [
- ":libyuv_internal",
+ deps = [ ":libyuv_internal" ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_use_lsx) {
+ static_library("libyuv_lsx") {
+ sources = [
+ # LSX Source Files
+ "source/rotate_lsx.cc",
+ "source/row_lsx.cc",
+ "source/scale_lsx.cc",
+ ]
+
+ cflags_cc = [
+ "-mlsx",
+ "-Wno-c++11-narrowing",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
-if (libyuv_use_mmi) {
- static_library("libyuv_mmi") {
+if (libyuv_use_lasx) {
+ static_library("libyuv_lasx") {
sources = [
- # MMI Source Files
- "source/compare_mmi.cc",
- "source/rotate_mmi.cc",
- "source/row_mmi.cc",
- "source/scale_mmi.cc",
+ # LASX Source Files
+ "source/row_lasx.cc",
]
- deps = [
- ":libyuv_internal",
+ cflags_cc = [
+ "-mlasx",
+ "-Wno-c++11-narrowing",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
@@ -275,11 +312,10 @@ if (libyuv_include_tests) {
testonly = true
sources = [
- # sources
- # headers
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
+ "unit_test/convert_argb_test.cc",
"unit_test/convert_test.cc",
"unit_test/cpu_test.cc",
"unit_test/cpu_thread_test.cc",
@@ -288,7 +324,10 @@ if (libyuv_include_tests) {
"unit_test/rotate_argb_test.cc",
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
+ "unit_test/scale_plane_test.cc",
+ "unit_test/scale_rgb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
"unit_test/unit_test.cc",
"unit_test/unit_test.h",
"unit_test/video_common_test.cc",
@@ -300,19 +339,20 @@ if (libyuv_include_tests) {
]
defines = []
- if (libyuv_use_gflags) {
- defines += [ "LIBYUV_USE_GFLAGS" ]
- deps += [ "//third_party/gflags" ]
+ if (libyuv_use_absl_flags) {
+ defines += [ "LIBYUV_USE_ABSL_FLAGS" ]
+ deps += [
+ "//third_party/abseil-cpp/absl/flags:flag",
+ "//third_party/abseil-cpp/absl/flags:parse",
+ ]
}
configs += [ ":libyuv_unittest_warnings_config" ]
- public_deps = [
- "//testing/gtest",
- ]
+ public_deps = [ "//testing/gtest" ]
public_configs = [ ":libyuv_unittest_config" ]
- if (is_linux) {
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
if (is_ios) {
@@ -349,10 +389,8 @@ if (libyuv_include_tests) {
# sources
"util/compare.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -362,10 +400,19 @@ if (libyuv_include_tests) {
# sources
"util/yuvconvert.cc",
]
- deps = [
- ":libyuv",
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("yuvconstants") {
+ sources = [
+ # sources
+ "util/yuvconstants.c",
]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -377,22 +424,26 @@ if (libyuv_include_tests) {
"util/psnr_main.cc",
"util/ssim.cc",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
}
+ executable("i444tonv12_eg") {
+ sources = [
+ # sources
+ "util/i444tonv12_eg.cc",
+ ]
+ deps = [ ":libyuv" ]
+ }
+
executable("cpuid") {
sources = [
# sources
"util/cpuid.c",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
}
}
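
The new `libyuv_config` defines are driven by GN args that appear in this diff, such as `libyuv_use_neon`, `libyuv_disable_rvv`, `libyuv_use_lsx`, and `libyuv_use_lasx`. A minimal sketch, assuming a depot_tools-style checkout with `gn` on PATH, that writes an `args.gn` and generates a build (the chosen values are only an example):

```python
import pathlib
import subprocess

ARGS = {
    "libyuv_use_neon": "false",      # adds LIBYUV_DISABLE_NEON via libyuv_config
    "libyuv_disable_rvv": "true",    # adds LIBYUV_DISABLE_RVV
    "libyuv_include_tests": "true",  # pulls in libyuv_unittest and the util tools
}

out = pathlib.Path("out/Example")
out.mkdir(parents=True, exist_ok=True)
# gn gen reads args.gn from the output directory.
(out / "args.gn").write_text("".join(f"{k} = {v}\n" for k, v in ARGS.items()))
subprocess.run(["gn", "gen", str(out)], check=True)
```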
diff --git a/files/CM_linux_packages.cmake b/CM_linux_packages.cmake
index 5f676f89..a073edfa 100644
--- a/files/CM_linux_packages.cmake
+++ b/CM_linux_packages.cmake
@@ -8,7 +8,7 @@ SET ( YUV_VER_MAJOR 0 )
SET ( YUV_VER_MINOR 0 )
SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} )
SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} )
-MESSAGE ( "Building ver.: ${YUV_VERSION}" )
+MESSAGE ( VERBOSE "Building ver.: ${YUV_VERSION}" )
# is this a 32-bit or 64-bit build?
IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
@@ -45,7 +45,7 @@ ELSE ()
SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" )
ENDIF ()
ENDIF ()
-MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" )
+MESSAGE ( VERBOSE "Packaging for: ${YUV_SYSTEM_NAME}" )
# define all the variables needed by CPack to create .deb and .rpm packages
SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" )
diff --git a/files/CMakeLists.txt b/CMakeLists.txt
index 188a26b7..9abfa74b 100644
--- a/files/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@
PROJECT ( YUV C CXX ) # "C" is required even for C++ projects
CMAKE_MINIMUM_REQUIRED( VERSION 2.8.12 )
-OPTION( TEST "Built unit tests" OFF )
+OPTION( UNIT_TEST "Built unit tests" OFF )
SET ( ly_base_dir ${PROJECT_SOURCE_DIR} )
SET ( ly_src_dir ${ly_base_dir}/source )
@@ -22,6 +22,10 @@ LIST ( SORT ly_unittest_sources )
INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} )
+if(MSVC)
+ ADD_DEFINITIONS ( -D_CRT_SECURE_NO_WARNINGS )
+endif()
+
# this creates the static library (.a)
ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
@@ -29,23 +33,36 @@ ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} )
SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" )
SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" )
+if(WIN32)
+ SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES IMPORT_PREFIX "lib" )
+endif()
+
+# this creates the cpuid tool
+ADD_EXECUTABLE ( cpuid ${ly_base_dir}/util/cpuid.c )
+TARGET_LINK_LIBRARIES ( cpuid ${ly_lib_static} )
# this creates the conversion tool
ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc )
TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} )
+# this creates the yuvconstants tool
+ADD_EXECUTABLE ( yuvconstants ${ly_base_dir}/util/yuvconstants.c )
+TARGET_LINK_LIBRARIES ( yuvconstants ${ly_lib_static} )
-INCLUDE ( FindJPEG )
+find_package ( JPEG )
if (JPEG_FOUND)
include_directories( ${JPEG_INCLUDE_DIR} )
- target_link_libraries( yuvconvert ${JPEG_LIBRARY} )
+ target_link_libraries( ${ly_lib_shared} ${JPEG_LIBRARY} )
add_definitions( -DHAVE_JPEG )
endif()
-if(TEST)
+if(UNIT_TEST)
find_library(GTEST_LIBRARY gtest)
if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+ if (CMAKE_CROSSCOMPILING)
+ set(GTEST_SRC_DIR third_party/googletest/src/googletest)
+ endif()
if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
@@ -54,7 +71,7 @@ if(TEST)
include_directories(${GTEST_SRC_DIR}/include)
set(GTEST_LIBRARY gtest)
else()
- message(FATAL_ERROR "TEST is set but unable to find gtest library")
+ message(FATAL_ERROR "UNIT_TEST is set but unable to find gtest library")
endif()
endif()
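
With the option renamed from `TEST` to `UNIT_TEST`, any script that configures the CMake build needs the new flag. A minimal configure-and-build sketch (directory paths are assumptions):

```python
import subprocess

def configure_and_build(src_dir: str = ".", build_dir: str = "build") -> None:
    # UNIT_TEST replaces the old TEST option; HAVE_JPEG is added
    # automatically by the CMakeLists above when find_package(JPEG) succeeds.
    subprocess.run(["cmake", "-S", src_dir, "-B", build_dir,
                    "-DUNIT_TEST=ON"], check=True)
    subprocess.run(["cmake", "--build", build_dir], check=True)

if __name__ == "__main__":
    configure_and_build()
```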
diff --git a/DEPS b/DEPS
new file mode 100644
index 00000000..70ed1d58
--- /dev/null
+++ b/DEPS
@@ -0,0 +1,2576 @@
+gclient_gn_args_file = 'src/build/config/gclient_args.gni'
+gclient_gn_args = [
+ 'generate_location_tags',
+]
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': 'af3d01376bec75a68f90160bfd38057d60510a2b',
+ 'gn_version': 'git_revision:fae280eabe5d31accc53100137459ece19a7a295',
+ # ninja CIPD package version.
+ # https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja
+ 'ninja_version': 'version:2@1.11.1.chromium.6',
+ # reclient CIPD package version
+ 'reclient_version': 're_client_version:0.110.0.43ec6b1-gomaip',
+
+ # Keep the Chromium default of generating location tags.
+ 'generate_location_tags': True,
+
+ # By default, download the fuchsia sdk from the public sdk directory.
+ 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/core/',
+ 'fuchsia_version': 'version:15.20230909.2.1',
+ # By default, download the fuchsia images from the fuchsia GCS bucket.
+ 'fuchsia_images_bucket': 'fuchsia',
+ 'checkout_fuchsia': False,
+ # Since the images are hundreds of MB, default to only downloading the image
+ # most commonly useful for developers. Bots and developers that need to use
+ # other images can override this with additional images.
+ 'checkout_fuchsia_boot_images': "terminal.qemu-x64,terminal.x64",
+ 'checkout_fuchsia_product_bundles': '"{checkout_fuchsia_boot_images}" != ""',
+}
+
+deps = {
+ 'src/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + '5885d3c24833ad72845a52a1b913a2b8bc651b56',
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/src/buildtools' + '@' + '79ab87fa54614258c4c95891e873223371194525',
+ 'src/testing':
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '51e9a02297057cc0e917763a51e16680b7d16fb6',
+ 'src/third_party':
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '2dc4b18abd1003ce7b1eda509dc96f12d49a9667',
+
+ 'src/buildtools/linux64': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/linux-${{arch}}',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "linux"',
+ },
+
+ 'src/buildtools/mac': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/mac-${{arch}}',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "mac"',
+ },
+
+ 'src/buildtools/win': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/windows-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "win"',
+ },
+
+ 'src/buildtools/reclient': {
+ 'packages': [
+ {
+ 'package': 'infra/rbe/client/${{platform}}',
+ 'version': Var('reclient_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/catapult':
+ Var('chromium_git') + '/catapult.git' + '@' + 'fa05d995e152efdae488a2aeba397cd609fdbc9d',
+ 'src/third_party/clang-format/script':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef',
+ 'src/third_party/colorama/src':
+ Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49',
+ 'src/third_party/cpu_features/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/cpu_features.git' + '@' + '936b9ab5515dead115606559502e3864958f7f6e',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/depot_tools':
+ Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'd3e43dd4319ba169c0aaf44547eecf861f2fe5da',
+ 'src/third_party/freetype/src':
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '9e3c5d7e183c1a8d5ed8868d7d28ef18d3ec9ec8',
+ 'third_party/fuchsia-gn-sdk': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/fuchsia-gn-sdk.git' + '@' + '0d6902558d92fe3d49ba9a8f638ddea829be595b',
+ 'condition': 'checkout_fuchsia',
+ },
+ 'src/third_party/googletest/src':
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07',
+ 'src/third_party/harfbuzz-ng/src':
+ Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'db700b5670d9475cc8ed4880cc9447b232c5e432',
+ 'src/third_party/libc++/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '84fb809dd6dae36d556dc0bb702c6cc2ce9d4b80',
+ 'src/third_party/libc++abi/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '8d21803b9076b16d46c32e2f10da191ee758520c',
+ 'src/third_party/libunwind/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f1c687e0aaf0d70b9a53a150e9be5cb63af9215f',
+ 'src/third_party/libjpeg_turbo':
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '30bdb85e302ecfc52593636b2f44af438e05e784',
+ 'src/third_party/nasm':
+ Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '7fc833e889d1afda72c06220e5bed8fb43b2e5ce',
+ 'src/tools':
+ Var('chromium_git') + '/chromium/src/tools' + '@' + 'a76c0dbb64c603a0d45e0c6dfae3a351b6e1adf1',
+
+ # libyuv-only dependencies (not present in Chromium).
+ 'src/third_party/gtest-parallel':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
+
+ 'src/third_party/lss': {
+ 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + 'ce877209e11aa69dcfffbd53ef90ea1d07136521',
+ 'condition': 'checkout_android or checkout_linux',
+ },
+
+ # Android deps:
+ 'src/third_party/accessibility_test_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/accessibility-test-framework',
+ 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/kotlin_stdlib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/kotlin_stdlib',
+ 'version': 'Z1gsqhL967kFQecxKrRwXHbl-vwQjpv0l7PMUZ0EVO8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/kotlinc/current': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/kotlinc',
+ 'version': 'Rr02Gf2EkaeSs3EhSUHhPqDHSd1AzimrM6cRYUJCPjQC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/boringssl/src':
+ 'https://boringssl.googlesource.com/boringssl.git' + '@' + '20a06474c0b4a16779311bfe98ba69dc2402101d',
+ 'src/base': {
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'd407b7061bce341bb6e11b539ea86c46c949ac4c',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/bazel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bazel',
+ 'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/bouncycastle': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bouncycastle',
+ 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_toolchain': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_toolchain/android_toolchain',
+ 'version': 'R_8suM8m0oHbZ1awdxGXvKEFpAOETscbfZxkkMthyk8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/androidx': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/androidx',
+ 'version': 'y7rF_rx56mD3FGhMiqnlbQ6HOqHJ95xUFNX1m-_a988C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_support_test_runner': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_support_test_runner',
+ 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_sdk/public': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools/34.0.0',
+ 'version': 'YK9Rzw3fDzMHVzatNN6VlyoD_81amLZpN1AbmkdOd6AC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/emulator',
+ 'version': '9lGp8nTUCRRWGMnI_96HcKfzjnxEJKUcfvfwmA3wXNkC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/patcher',
+ 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platform-tools',
+ 'version': 'HWVsGs2HCKgSVv41FsOcsfJbNcB0UFiNrF6Tc4yRArYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-34',
+ 'version': 'u-bhWbTME6u-DjypTgr3ZikCyeAeU6txkR9ET6Uudc8C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox',
+ 'version': 'YWMYkzyxGBgVsty0GhXL1oxbY0pGXQIgFc0Rh7ZMRPYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/sources/android-31',
+ 'version': '_a_BcnANjPYw5mSKlNHa7GFY8yc1kdqj2rmQgac7yUcC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/cmdline-tools',
+ 'version': 'EWnL2r7oV5GtE9Ef7GyohyFam42wtMtEKYU4dCb3U1YC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/tools/clang/dsymutil': {
+ 'packages': [
+ {
+ 'package': 'chromium/llvm-build-tools/dsymutil',
+ 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC',
+ }
+ ],
+ 'condition': 'checkout_mac',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_build_tools/aapt2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_build_tools/aapt2',
+ 'version': 'STY0BXlZxsEhudnlXQFed-B5UpwehcoM0sYqor6qRqsC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/byte_buddy': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/byte_buddy',
+ 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/byte_buddy/android_sdk_build_tools_25_0_2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools',
+ 'version': 'kwIs2vdfTm93yEP8LG5aSnchN4BVEdVxbqQtF4XpPdkC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ced/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/errorprone/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/findbugs': {
+ 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/gson',
+ 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/guava',
+ 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/hamcrest': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/hamcrest',
+ 'version': '37eccfc658fe79695d6abb6dd497463c4372032f',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/icu': {
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'e8c3bc9ea97d4423ad0515e5f1c064f486dae8b1',
+ },
+ 'src/third_party/icu4j': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/icu4j',
+ 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/intellij': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/intellij',
+ 'version': '77c2721b024b36ee073402c08e6d8428c0295336',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jdk': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/jdk',
+ 'version': 'GCFtf5t6M4HlrHj6NXedHbpHp2xjgognF8ptNci4478C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jsr-305/src': {
+ 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/junit/src': {
+ 'url': Var('chromium_git') + '/external/junit.git' + '@' + '05fe2a64f59127c02135be22f416e91260d6ede6',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/libunwindstack': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '4dbfa0e8c844c8e243b297bc185e54a99ff94f9e',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/ninja': {
+ 'packages': [
+ {
+ 'package': 'infra/3pp/tools/ninja/${{platform}}',
+ 'version': Var('ninja_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ },
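+ # In the ninja entry above, '${{platform}}' is a CIPD placeholder that
+ # gclient expands to the host platform, so a single entry serves every
+ # host OS; on a 64-bit Linux machine, for example, the resolved package
+ # would be something like:
+ #
+ #   infra/3pp/tools/ninja/linux-amd64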
+ 'src/third_party/mockito/src': {
+ 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '7c3641bcef717ffa7d765f2c86b847d0aab1aac9',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/objenesis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/objenesis',
+ 'version': 'tknDblENYi8IaJYyD6tUahUyHYZlzJ_Y74_QZSz4DpIC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ow2_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/ow2_asm',
+ 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/r8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/r8',
+ 'version': 'O1BBWiBTIeNUcraX8STMtQXVaCleu6SJJjWCcnfhPLkC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ # This duplication is intentional, so we avoid updating the r8.jar used by
+ # dexing unless necessary, since each update invalidates all incremental
+ # dexing and unnecessarily slows down all bots.
+ 'src/third_party/r8/d8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/r8',
+ 'version': 'vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
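+ # A pinned instance like the one above can be inspected with the cipd
+ # CLI if needed; a hedged example, reusing the d8 instance ID from the
+ # entry above:
+ #
+ #   cipd describe chromium/third_party/r8 \
+ #       -version vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC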
+ 'src/third_party/proguard': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/proguard',
+ 'version': 'Fd91BJFVlmiO6c46YMTsdy7n2f5Sk2hVVGlzPLvqZPsC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/requests/src': {
+ 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'c7e0fc087ceeadb8b4c84a0953a422c474093d6d',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/robolectric',
+ 'version': 'hzetqh1qFI32FOgQroZvGcGdomrgVBJ6WKRnl1KFw6EC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/sqlite4java': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/sqlite4java',
+ 'version': 'LofjKH9dgXIAJhRYCPQlMFywSwxYimrfDeBmaHc-Z5EC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/turbine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/turbine',
+ 'version': '2I2Nz480QsuCxpQ1lMfbigX8l5HAhX3_ykWU4TKRGo4C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ub-uiautomator/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
+ 'condition': 'checkout_android',
+ },
+
+ # iOS deps:
+ 'src/ios': {
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + 'ddd58e86cf4ebdc0db60a5d0f3c323de49bb295c',
+ 'condition': 'checkout_ios',
+ },
+
+ # Everything coming after this is automatically updated by the auto-roller.
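+ # The generated entries below all share one shape: a CIPD package under
+ # chromium/third_party/android_deps/libs pinned to a tag of the form
+ # 'version:2@<upstream version>.crN', where the '.crN' suffix tracks
+ # Chromium-side repackagings of the same upstream release. To refresh
+ # the block, rerun the generator named in the marker below (assuming a
+ # Chromium-style checkout rooted at src/):
+ #
+ #   python3 src/third_party/android_deps/fetch_all.py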
+ # === ANDROID_DEPS Generated Code Start ===
+ # Generated by //third_party/android_deps/fetch_all.py
+ 'src/third_party/android_deps/libs/android_arch_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_collections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_customview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_design': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_documentfile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
+ 'version': 'version:2@1.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_sdk_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine',
+ 'version': 'version:2@2.8.8.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_annotations',
+ 'version': 'version:2@4.1.1.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework',
+ 'version': 'version:2@4.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api',
+ 'version': 'version:2@2.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
+ 'version': 'version:2@20.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
+ 'version': 'version:2@18.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
+ 'version': 'version:2@18.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging',
+ 'version': 'version:2@16.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
+ 'version': 'version:2@19.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
+ 'version': 'version:2@18.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
+ 'version': 'version:2@20.1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
+ 'version': 'version:2@19.1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_material_material': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material',
+ 'version': 'version:2@1.7.0-alpha02.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_play_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core_common',
+ 'version': 'version:2@2.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_play_feature_delivery': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_feature_delivery',
+ 'version': 'version:2@2.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_auto_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common',
+ 'version': 'version:2@1.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service',
+ 'version': 'version:2@1.0-rc6.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations',
+ 'version': 'version:2@1.0-rc6.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations',
+ 'version': 'version:2@1.10.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305',
+ 'version': 'version:2@3.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_gson_gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson',
+ 'version': 'version:2@2.9.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations',
+ 'version': 'version:2@2.18.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac',
+ 'version': 'version:2@9+181-r4173-1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded',
+ 'version': 'version:2@9-dev-r4023-3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations',
+ 'version': 'version:2@16.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common',
+ 'version': 'version:2@19.5.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_components': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components',
+ 'version': 'version:2@16.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders',
+ 'version': 'version:2@16.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json',
+ 'version': 'version:2@17.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid',
+ 'version': 'version:2@21.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations',
+ 'version': 'version:2@16.3.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop',
+ 'version': 'version:2@16.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector',
+ 'version': 'version:2@18.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging',
+ 'version': 'version:2@21.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format',
+ 'version': 'version:2@1.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_failureaccess': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess',
+ 'version': 'version:2@1.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava',
+ 'version': 'version:2@31.1-jre.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android',
+ 'version': 'version:2@31.1-android.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations',
+ 'version': 'version:2@1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java',
+ 'version': 'version:2@3.19.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite',
+ 'version': 'version:2@3.21.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils',
+ 'version': 'version:2@1.3.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_javapoet': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
+ 'version': 'version:2@1.13.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_javawriter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter',
+ 'version': 'version:2@2.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_okio_okio_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm',
+ 'version': 'version:2@3.3.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm',
+ 'version': 'version:2@4.7.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils',
+ 'version': 'version:2@4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_api',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_binder': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_binder',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_context': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_context',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_core',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_stub': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_stub',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_perfmark_perfmark_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_perfmark_perfmark_api',
+ 'version': 'version:2@0.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api',
+ 'version': 'version:2@1.3.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_inject_javax_inject': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject',
+ 'version': 'version:2@1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy',
+ 'version': 'version:2@1.14.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent',
+ 'version': 'version:2@1.14.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap',
+ 'version': 'version:2@0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on',
+ 'version': 'version:2@1.72.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup',
+ 'version': 'version:2@1.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual',
+ 'version': 'version:2@2.5.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual',
+ 'version': 'version:2@3.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_util',
+ 'version': 'version:2@3.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone',
+ 'version': 'version:2@3.15.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations',
+ 'version': 'version:2@1.21.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber',
+ 'version': 'version:2@2.5.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit',
+ 'version': 'version:2@4.4.1.201607150455-r.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_hamcrest_hamcrest': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_hamcrest_hamcrest',
+ 'version': 'version:2@2.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7',
+ 'version': 'version:2@1.8.20.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8',
+ 'version': 'version:2@1.8.20.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm',
+ 'version': 'version:2@0.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jsoup_jsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jsoup_jsoup',
+ 'version': 'version:2@1.15.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_subclass': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_objenesis_objenesis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_objenesis_objenesis',
+ 'version': 'version:2@3.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_pcollections_pcollections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections',
+ 'version': 'version:2@3.1.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_nativeruntime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat',
+ 'version': 'version:2@1.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_pluginapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_sandbox': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadowapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ # === ANDROID_DEPS Generated Code End ===
+}
+
+pre_deps_hooks = [
+ {
+ # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
+ # TODO(kjellander): Remove this in March 2017.
+ 'name': 'cleanup_links',
+ 'pattern': '.',
+ 'action': ['python3', 'src/cleanup_links.py'],
+ },
+]
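+# pre_deps_hooks run before any of the deps above are synced; the regular
+# hooks below only run once the checkout is complete.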
+
+hooks = [
+ {
+ # This clobbers when necessary (based on get_landmines.py). It should be
+ # an early hook, but it needs to run after Chromium is synced and the
+ # links are set up, so that the script actually exists.
+ 'name': 'landmines',
+ 'pattern': '.',
+ 'action': [
+ 'python3',
+ 'src/build/landmines.py',
+ '--landmine-scripts',
+ 'src/tools_libyuv/get_landmines.py',
+ '--src-dir',
+ 'src',
+ ],
+ },
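+ # A "landmine" is a reason string emitted by get_landmines.py; when the
+ # emitted list changes between syncs, landmines.py schedules a clobber
+ # of the build output rather than risking a stale incremental build.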
+ # Downloads the current stable linux sysroot to build/linux/ if needed.
+ {
+ 'name': 'sysroot_arm',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=arm'],
+ },
+ {
+ 'name': 'sysroot_arm64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm64',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=arm64'],
+ },
+ {
+ 'name': 'sysroot_x86',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x86'],
+ },
+ {
+ 'name': 'sysroot_mips',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_mips',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=mips'],
+ },
+ {
+ 'name': 'sysroot_x64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_x64',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x64'],
+ },
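+ # Each sysroot hook wraps the same script with a different --arch; it
+ # can also be run by hand when switching target architectures, e.g.:
+ #
+ #   python3 src/build/linux/sysroot_scripts/install-sysroot.py --arch=arm64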
+ {
+ # Update the Windows toolchain if necessary.
+ 'name': 'win_toolchain',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/vs_toolchain.py', 'update'],
+ },
+ {
+ # Update the Mac toolchain if necessary.
+ 'name': 'mac_toolchain',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/mac_toolchain.py'],
+ 'condition': 'checkout_mac',
+ },
+ {
+ 'name': 'msan_chained_origins_focal',
+ 'pattern': '.',
+ 'condition': 'checkout_instrumented_libraries',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1',
+ ],
+ },
+ {
+ 'name': 'msan_no_origins_focal',
+ 'pattern': '.',
+ 'condition': 'checkout_instrumented_libraries',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1',
+ ],
+ },
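+ # download_from_google_storage.py resolves each '-s' stamp file to an
+ # object hash and fetches the matching archive from the named bucket, so
+ # only the *.sha1 stamps are checked in, never the binaries themselves.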
+ {
+ 'name': 'Download Fuchsia SDK from GCS',
+ 'pattern': '.',
+ 'condition': 'checkout_fuchsia',
+ 'action': [
+ 'python3',
+ 'src/build/fuchsia/update_sdk.py',
+ '--cipd-prefix={fuchsia_sdk_cipd_prefix}',
+ '--version={fuchsia_version}',
+ ],
+ },
+ {
+ 'name': 'Download Fuchsia system images',
+ 'pattern': '.',
+ 'condition': 'checkout_fuchsia and checkout_fuchsia_product_bundles',
+ 'action': [
+ 'python3',
+ 'src/build/fuchsia/update_product_bundles.py',
+ '{checkout_fuchsia_boot_images}',
+ ],
+ },
+ {
+ # Pull clang if needed or requested via GYP_DEFINES.
+ # Note: On Win, this should run after win_toolchain, as it may use it.
+ 'name': 'clang',
+ 'pattern': '.',
+ 'action': ['python3', 'src/tools/clang/scripts/update.py'],
+ },
+ {
+ # Update LASTCHANGE.
+ 'name': 'lastchange',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/util/lastchange.py',
+ '-o', 'src/build/util/LASTCHANGE'],
+ },
+ # Pull clang-format binaries using checked-in hashes.
+ {
+ 'name': 'clang_format_win',
+ 'pattern': '.',
+ 'condition': 'host_os == "win"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/win/clang-format.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac_x64',
+ 'pattern': '.',
+ 'condition': 'host_os == "mac" and host_cpu == "x64"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/mac/clang-format.x64.sha1',
+ '-o', 'src/buildtools/mac/clang-format',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac_arm64',
+ 'pattern': '.',
+ 'condition': 'host_os == "mac" and host_cpu == "arm64"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/mac/clang-format.arm64.sha1',
+ '-o', 'src/buildtools/mac/clang-format',
+ ],
+ },
+ {
+ 'name': 'clang_format_linux',
+ 'pattern': '.',
+ 'condition': 'host_os == "linux"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/linux64/clang-format.sha1',
+ ],
+ },
+ # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
+ {
+ 'name': 'luci-go_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/win64',
+ ],
+ },
+ {
+ 'name': 'luci-go_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/mac64',
+ ],
+ },
+ {
+ 'name': 'luci-go_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/linux64',
+ ],
+ },
+ {
+ 'name': 'Generate component metadata for tests',
+ 'pattern': '.',
+ 'action': [
+ 'vpython3',
+ 'src/testing/generate_location_tags.py',
+ '--out',
+ 'src/testing/location_tags.json',
+ ],
+ },
+]
+
+recursedeps = []
diff --git a/DIR_METADATA b/DIR_METADATA
new file mode 100644
index 00000000..8bc04f15
--- /dev/null
+++ b/DIR_METADATA
@@ -0,0 +1,3 @@
+monorail {
+ component: "Internals>Images>Codecs"
+}
diff --git a/LICENSE b/LICENSE
index da40b336..c911747a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011, Google Inc. All rights reserved.
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
diff --git a/METADATA b/METADATA
index d97975ca..19d0436e 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,19 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update libyuv
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+
+name: "libyuv"
+description: "libyuv is an open source project that includes YUV scaling and conversion functionality."
third_party {
license_type: NOTICE
+ last_upgrade_date {
+ year: 2024
+ month: 1
+ day: 11
+ }
+ identifier {
+ type: "Git"
+ value: "https://chromium.googlesource.com/libyuv/libyuv/"
+ version: "af6ac8265bbd07bcf977526458b60305c4304288"
+ }
}
diff --git a/OWNERS b/OWNERS
index a607e727..f11a7bfd 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,4 +1,11 @@
-fbarchard@google.com
-phoglund@google.com
-magjed@google.com
-chz@google.com
+mbonadei@chromium.org
+fbarchard@chromium.org
+magjed@chromium.org
+wtc@google.com
+jansson@google.com
+
+per-file *.gn=mbonadei@chromium.org,jansson@google.com
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=mbonadei@chromium.org,jansson@google.com
diff --git a/OWNERS.android b/OWNERS.android
new file mode 100644
index 00000000..7529cb92
--- /dev/null
+++ b/OWNERS.android
@@ -0,0 +1 @@
+include platform/system/core:/janitors/OWNERS
diff --git a/files/PATENTS b/PATENTS
index 64aa5c90..64aa5c90 100644
--- a/files/PATENTS
+++ b/PATENTS
diff --git a/files/PRESUBMIT.py b/PRESUBMIT.py
index 2cf1542f..d3901caf 100755..100644
--- a/files/PRESUBMIT.py
+++ b/PRESUBMIT.py
@@ -6,50 +6,30 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import os
-
-
-def _RunPythonTests(input_api, output_api):
- def join(*args):
- return input_api.os_path.join(input_api.PresubmitLocalPath(), *args)
-
- test_directories = [
- root for root, _, files in os.walk(join('tools_libyuv'))
- if any(f.endswith('_test.py') for f in files)
- ]
-
- tests = []
- for directory in test_directories:
- tests.extend(
- input_api.canned_checks.GetUnitTestsInDirectory(
- input_api,
- output_api,
- directory,
- whitelist=[r'.+_test\.py$']))
- return input_api.RunTests(tests, parallel=True)
-
+# Runs PRESUBMIT.py in py3 mode by git cl presubmit.
+USE_PYTHON3 = True
def _CommonChecks(input_api, output_api):
"""Checks common to both upload and commit."""
results = []
results.extend(input_api.canned_checks.RunPylint(input_api, output_api,
- black_list=(r'^base[\\\/].*\.py$',
- r'^build[\\\/].*\.py$',
- r'^buildtools[\\\/].*\.py$',
- r'^ios[\\\/].*\.py$',
- r'^out.*[\\\/].*\.py$',
- r'^testing[\\\/].*\.py$',
- r'^third_party[\\\/].*\.py$',
- r'^tools[\\\/].*\.py$',
- # TODO(kjellander): should arguably be checked.
- r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
- r'^xcodebuild.*[\\\/].*\.py$',),
+ files_to_skip=(r'^base[\\\/].*\.py$',
+ r'^build[\\\/].*\.py$',
+ r'^buildtools[\\\/].*\.py$',
+ r'^ios[\\\/].*\.py$',
+ r'^out.*[\\\/].*\.py$',
+ r'^testing[\\\/].*\.py$',
+ r'^third_party[\\\/].*\.py$',
+ r'^tools[\\\/].*\.py$',
+ # TODO(kjellander): should arguably be checked.
+ r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
+ r'^xcodebuild.*[\\\/].*\.py$',),
disabled_warnings=['F0401', # Failed to import x
'E0611', # No package y in x
'W0232', # Class has no __init__ method
],
- pylintrc='pylintrc'))
- results.extend(_RunPythonTests(input_api, output_api))
+ pylintrc='pylintrc',
+ version='2.7'))
return results
diff --git a/files/README.chromium b/README.chromium
index bddc2023..1389f285 100644
--- a/files/README.chromium
+++ b/README.chromium
@@ -1,8 +1,10 @@
Name: libyuv
-URL: http://code.google.com/p/libyuv/
-Version: 1732
+URL: https://chromium.googlesource.com/libyuv/libyuv/
+Version: 1883
License: BSD
License File: LICENSE
+Shipped: yes
Description:
libyuv is an open source project that includes YUV conversion and scaling functionality.
+
diff --git a/files/README.md b/README.md
index db70b7f0..95eeb04c 100644
--- a/files/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
* Optimized for SSSE3/AVX2 on x86/x64.
* Optimized for Neon on Arm.
* Optimized for MSA on Mips.
+* Optimized for RVV on RISC-V.
### Development
diff --git a/README.version b/README.version
deleted file mode 100644
index 0e74ad15..00000000
--- a/README.version
+++ /dev/null
@@ -1,3 +0,0 @@
-Version: r1732
-BugComponent: 42195
-Owner: lajos
diff --git a/files/build_overrides/build.gni b/build_overrides/build.gni
index 6d8319b9..d9d01d51 100644
--- a/files/build_overrides/build.gni
+++ b/build_overrides/build.gni
@@ -6,9 +6,6 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-# Some non-Chromium builds don't use Chromium's third_party/binutils.
-linux_use_bundled_binutils_override = true
-
# Variable that can be used to support multiple build scenarios, like having
# Chromium specific targets in a client project's GN file etc.
build_with_chromium = false
@@ -16,6 +13,9 @@ build_with_chromium = false
# Some non-Chromium builds don't support building java targets.
enable_java_templates = true
+# Enables assertions on safety checks in libc++.
+enable_safe_libcxx = true
+
# Allow using custom suppressions files (currently not used by libyuv).
asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc"
lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc"
@@ -44,3 +44,20 @@ if (host_os == "mac") {
"hermetic toolchain if the minimum OS version is not met.")
use_system_xcode = _result == 0
}
+
+declare_args() {
+ # Tracing support requires //third_party/perfetto.
+ enable_base_tracing = false
+ use_perfetto_client_library = false
+
+ # Limits the defined //third_party/android_deps targets to only "buildCompile"
+ # and "buildCompileNoDeps" targets. This is useful for third-party
+ # repositories which do not use JUnit tests. For instance,
+ # limit_android_deps == true removes "gn gen" requirement for
+ # //third_party/robolectric .
+ limit_android_deps = false
+
+ # Allows googletest to pretty-print various absl types.
+ # Defined here rather than in gtest.gni to match chromium.
+ gtest_enable_absl_printers = true
+}
diff --git a/files/build_overrides/gtest.gni b/build_overrides/gtest.gni
index d3c3f68c..d3c3f68c 100644
--- a/files/build_overrides/gtest.gni
+++ b/build_overrides/gtest.gni
diff --git a/build_overrides/partition_alloc.gni b/build_overrides/partition_alloc.gni
new file mode 100644
index 00000000..dcf8ac2d
--- /dev/null
+++ b/build_overrides/partition_alloc.gni
@@ -0,0 +1,17 @@
+# Copyright 2022 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Use default values for PartitionAlloc as standalone library from
+# base/allocator/partition_allocator/build_overrides/partition_alloc.gni
+use_partition_alloc_as_malloc_default = false
+use_allocator_shim_default = false
+enable_backup_ref_ptr_support_default = false
+enable_mte_checked_ptr_support_default = false
+put_ref_count_in_previous_slot_default = false
+enable_backup_ref_ptr_slow_checks_default = false
+enable_dangling_raw_ptr_checks_default = false
diff --git a/files/cleanup_links.py b/cleanup_links.py
index ba290789..7d1eba9b 100755
--- a/files/cleanup_links.py
+++ b/cleanup_links.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env vpython3
+
# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -18,8 +19,8 @@ landing that change, this script cleans up any old symlinks, avoiding annoying
manual cleanup needed in order to complete gclient sync.
"""
+import argparse
import logging
-import optparse
import os
import shelve
import subprocess
@@ -32,14 +33,14 @@ LINKS_DB = 'links'
# Version management to make future upgrades/downgrades easier to support.
SCHEMA_VERSION = 1
-class WebRTCLinkSetup(object):
+class WebRTCLinkSetup():
def __init__(self, links_db, dry_run=False):
self._dry_run = dry_run
self._links_db = links_db
def CleanupLinks(self):
logging.debug('CleanupLinks')
- for source, link_path in self._links_db.iteritems():
+    for source, link_path in self._links_db.items():
if source == 'SCHEMA_VERSION':
continue
if os.path.islink(link_path) or sys.platform.startswith('win'):
@@ -71,15 +72,15 @@ def _initialize_database(filename):
def main():
- parser = optparse.OptionParser()
- parser.add_option('-d', '--dry-run', action='store_true', default=False,
- help='Print what would be done, but don\'t perform any '
- 'operations. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-v', '--verbose', action='store_const',
- const=logging.DEBUG, default=logging.INFO,
- help='Print verbose output for debugging.')
- options, _ = parser.parse_args()
+ p = argparse.ArgumentParser()
+ p.add_argument('-d', '--dry-run', action='store_true', default=False,
+ help='Print what would be done, but don\'t perform any '
+ 'operations. This will automatically set logging to '
+ 'verbose.')
+ p.add_argument('-v', '--verbose', action='store_const',
+ const=logging.DEBUG, default=logging.INFO,
+ help='Print verbose output for debugging.')
+ options = p.parse_args()
if options.dry_run:
options.verbose = logging.DEBUG
diff --git a/codereview.settings b/codereview.settings
index 9782886f..b226fae5 100644
--- a/codereview.settings
+++ b/codereview.settings
@@ -1,5 +1,5 @@
-# This file is used by git cl to get repository specific information.
+# This file is used by `git cl` to get repository specific information.
+CODE_REVIEW_SERVER: codereview.chromium.org
GERRIT_HOST: True
PROJECT: libyuv
-TRY_ON_UPLOAD: False
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
diff --git a/files/docs/deprecated_builds.md b/docs/deprecated_builds.md
index 29e0bf9b..8edefd78 100644
--- a/files/docs/deprecated_builds.md
+++ b/docs/deprecated_builds.md
@@ -165,11 +165,11 @@ mipsel
arm32 disassembly:
- third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+ llvm-objdump -d out/Release/obj/source/libyuv.row_neon.o
arm64 disassembly:
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+ llvm-objdump -d out/Release/obj/source/libyuv.row_neon64.o
Running tests:
@@ -239,6 +239,7 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium
ninja -C out/Debug libyuv_unittest
ninja -C out/Debug compare
ninja -C out/Debug yuvconvert
+ ninja -C out/Debug yuvconstants
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
diff --git a/files/docs/environment_variables.md b/docs/environment_variables.md
index cd8159ad..4eb09659 100644
--- a/files/docs/environment_variables.md
+++ b/docs/environment_variables.md
@@ -22,6 +22,7 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
LIBYUV_DISABLE_F16C
LIBYUV_DISABLE_AVX512BW
LIBYUV_DISABLE_AVX512VL
+ LIBYUV_DISABLE_AVX512VNNI
LIBYUV_DISABLE_AVX512VBMI
LIBYUV_DISABLE_AVX512VBMI2
LIBYUV_DISABLE_AVX512VBITALG
@@ -34,7 +35,13 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
## MIPS CPUs
LIBYUV_DISABLE_MSA
- LIBYUV_DISABLE_MMI
+
+## LOONGARCH CPUs
+ LIBYUV_DISABLE_LSX
+ LIBYUV_DISABLE_LASX
+
+## RISCV CPUs
+ LIBYUV_DISABLE_RVV
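+
+These switches are read when libyuv first probes the CPU, so they must be set
+before the first call into the library. A minimal sketch, not part of this
+change, assuming POSIX setenv and the public cpu_id.h API:
+
+    #include <cstdlib>
+
+    #include "libyuv/cpu_id.h"
+
+    int main() {
+      // Must run before any libyuv call initializes the CPU flags.
+      setenv("LIBYUV_DISABLE_RVV", "1", /*overwrite=*/1);
+      // MaskCpuFlags is the programmatic equivalent: 0 disables every SIMD
+      // path and forces the portable C code.
+      libyuv::MaskCpuFlags(0);
+      return 0;
+    }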
# Test Width/Height/Repeat
diff --git a/files/docs/filtering.md b/docs/filtering.md
index 8696976e..8696976e 100644
--- a/files/docs/filtering.md
+++ b/docs/filtering.md
diff --git a/files/docs/formats.md b/docs/formats.md
index 97e8ce05..12ea9465 100644
--- a/files/docs/formats.md
+++ b/docs/formats.md
@@ -4,7 +4,9 @@ Formats (FOURCC) supported by libyuv are detailed here.
# Core Formats
-There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB.
+There are 2 core formats supported by libyuv - I420 and ARGB.
+ All YUV formats can be converted to/from I420.
+ All RGB formats can be converted to/from ARGB.
Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
@@ -36,7 +38,7 @@ This is how OSX formats map to libyuv
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -46,16 +48,20 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020, unofficial fourcc.
+ // 10 bit lsb
// 1 Secondary YUV format: row biplanar.
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated.
- // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
+ // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc, 2 64 bpp
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
+ FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel.
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
@@ -66,7 +72,7 @@ The following is extracted from video_common.h as a complete list of formats sup
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 11 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -75,6 +81,9 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -104,6 +113,27 @@ The following is extracted from video_common.h as a complete list of formats sup
I444, NV24 and NV42 are full width, full height
I400 and J400 have no chroma channel.
+# Color space
+ The YUV formats start with a letter to specify the color space. e.g. I420
+ I = BT.601 limited range
+ J = BT.601 full range (J = JPEG that uses this)
+ H = BT.709 limited range (H for HD)
+ F = BT.709 full range (F for Full range)
+ U = BT.2020 limited range (U for UHD)
+ V = BT.2020 full range
+  For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h
+
+# HDR formats
+ Planar formats with 10 or 12 bits use the following fourcc:
+ I010, I012, P010, P012 are half width, half height
+ I210, I212, P210, P212 are half width, full height
+ I410, I412, P410, P412 are full width, full height
+ where
+ I is the color space (see above) and 3 planes: Y, U and V.
+ P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane.
+ 0, 2 or 4 is the last digit of subsampling: 4:2:0, 4:2:2, or 4:4:4
+ 10 or 12 is the bits per channel. The bits are in the low bits of a 16 bit channel.
+
# The ARGB FOURCC
There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
@@ -152,6 +182,13 @@ The 2 bit alpha has 4 values. Here are the comparable 8 bit alpha values.
The 10 bit RGB values range from 0 to 1023.
XR30 is the same as AR30 but with no alpha channel.
+# AB64 and AR64
+
+AB64 is similar to ABGR, with 16 bit (2 bytes) per channel. Each channel stores an unsigned short.
+In memory R is the lowest and A is the highest.
+Each channel has values ranging from 0 to 65535.
+AR64 is similar to ARGB.
+
# NV12 and NV21
NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -161,3 +198,11 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
+
+# YUY2 and UYVY
+
+YUY2 is a packed YUV format with half width, full height.
+
+YUY2 is YUYV in memory
+UYVY is UYVY in memory
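+
+# Example
+
+Putting the naming rules above together - a minimal sketch, not part of this
+change, assuming tightly packed planes (stride == width) and the public
+convert.h / convert_argb.h APIs; the helper names are illustrative only:
+
+    #include "libyuv/convert.h"       // I010ToI420
+    #include "libyuv/convert_argb.h"  // I420ToARGBMatrix, kYuvH709Constants
+
+    // Convert a tightly packed I010 frame (10 bit lsb, BT.601, 4:2:0) down
+    // to 8 bit I420. Strides of 16 bit planes are counted in uint16_t units.
+    void I010FrameToI420(const uint16_t* src_y, const uint16_t* src_u,
+                         const uint16_t* src_v, uint8_t* dst_y,
+                         uint8_t* dst_u, uint8_t* dst_v,
+                         int width, int height) {
+      int half = (width + 1) / 2;  // 4:2:0 chroma is half width.
+      libyuv::I010ToI420(src_y, width, src_u, half, src_v, half,
+                         dst_y, width, dst_u, half, dst_v, half,
+                         width, height);
+    }
+
+    // The color space letter selects a default matrix; the *Matrix variants
+    // take it explicitly, e.g. BT.709 limited range ("H"):
+    void H420FrameToARGB(const uint8_t* src_y, const uint8_t* src_u,
+                         const uint8_t* src_v, uint8_t* dst_argb,
+                         int width, int height) {
+      int half = (width + 1) / 2;
+      libyuv::I420ToARGBMatrix(src_y, width, src_u, half, src_v, half,
+                               dst_argb, width * 4,
+                               &libyuv::kYuvH709Constants, width, height);
+    }
+
+The fourcc-specific entry points are thin wrappers over these Matrix
+variants, so passing the constants explicitly gives the same result.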
diff --git a/files/docs/getting_started.md b/docs/getting_started.md
index 4426b606..f2f71b8b 100644
--- a/files/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -139,11 +139,11 @@ mips
arm disassembly:
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
+ llvm-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
+ llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+ llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
@@ -165,6 +165,7 @@ Running test with C code:
ninja -C out/Debug libyuv_unittest
ninja -C out/Debug compare
ninja -C out/Debug yuvconvert
+ ninja -C out/Debug yuvconstants
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
@@ -179,8 +180,8 @@ Running test with C code:
mips
- gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
- gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" is_component_build=false use_sysroot=false use_gold=false"
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
@@ -190,7 +191,7 @@ mips
make V=1 -f linux.mk
make V=1 -f linux.mk clean
- make V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake
@@ -219,6 +220,47 @@ Install cmake: http://www.cmake.org/
make -j4
make package
+## Building RISC-V target with cmake
+
+### Prerequisite: build risc-v clang toolchain and qemu
+
+If you don't have a prebuilt clang and riscv64 qemu, run the script below to download the sources and build them.
+
+ ./riscv_script/prepare_toolchain_qemu.sh
+
+After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+### Cross-compile for RISC-V target
+ cmake -B out/Release/ -DUNIT_TEST=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \
+ -DTOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ -DUSE_RVV=ON .
+ cmake --build out/Release/
+
+#### Customized Compiler Flags
+
+Customized compiler flags are supported via `-DRISCV_COMPILER_FLAGS="xxx"`.
+If `-DRISCV_COMPILER_FLAGS="xxx"` is assigned manually, the default compile flags (e.g. `-march=xxx`) will not be appended.
+
+Example:
+
+ cmake -B out/Release/ -DUNIT_TEST=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \
+ -DRISCV_COMPILER_FLAGS="-mcpu=sifive-x280" \
+ .
+
+### Run on QEMU
+
+#### Run libyuv_unittest on QEMU
+ cd out/Release/
+ USE_RVV=ON \
+ TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ QEMU_PREFIX_PATH={QEMU_PREFIX_PATH} \
+ ../../riscv_script/run_qemu.sh libyuv_unittest
+
## Setup for Arm Cross compile
See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html
diff --git a/files/docs/rotation.md b/docs/rotation.md
index fb84fce5..a08430fd 100644
--- a/files/docs/rotation.md
+++ b/docs/rotation.md
@@ -100,4 +100,8 @@ Inverting can be achieved with almost any libyuv function by passing a negative
I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+# Cropping - Vertical Flip
+When cropping from a subsampled format like NV21, setting the start pointers won't work for an odd crop start y on the UV plane.
+If the height after cropping will be odd, invert the source instead - point to the last row, negate the strides, and pass a
+negative height, which will re-invert the image as the conversion outputs, as sketched below.
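+
+A minimal sketch, not part of this change; the helper name is illustrative,
+planes are assumed tightly packed with even width, and the public NV21ToI420
+from convert.h is used:
+
+    #include "libyuv/convert.h"
+
+    // Crop the bottom crop_height rows of an NV21 frame (where the crop
+    // start y is odd) and convert to upright I420.
+    void CropBottomNV21ToI420(const uint8_t* nv21, int width, int height,
+                              int crop_height, uint8_t* dst_y,
+                              uint8_t* dst_u, uint8_t* dst_v) {
+      const uint8_t* vu = nv21 + width * height;  // interleaved VU plane.
+      // Point both planes at their last row; negated strides walk upward.
+      const uint8_t* last_y = nv21 + (height - 1) * width;
+      const uint8_t* last_vu = vu + ((height + 1) / 2 - 1) * width;
+      int half = (width + 1) / 2;
+      libyuv::NV21ToI420(last_y, -width, last_vu, -width,
+                         dst_y, width, dst_u, half, dst_v, half,
+                         // Negative height re-inverts; output is upright.
+                         width, -crop_height);
+    }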
diff --git a/files/download_vs_toolchain.py b/download_vs_toolchain.py
index 4b345789..6bc086d6 100644
--- a/files/download_vs_toolchain.py
+++ b/download_vs_toolchain.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python
-#
+#!/usr/bin/env vpython3
+
# Copyright 2014 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -22,7 +22,7 @@ sys.path.insert(0, os.path.join(checkout_root, 'build'))
sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
-import vs_toolchain
+import vs_toolchain # pylint: disable=wrong-import-position
if __name__ == '__main__':
diff --git a/files/.gitignore b/files/.gitignore
deleted file mode 100644
index 711f09e0..00000000
--- a/files/.gitignore
+++ /dev/null
@@ -1,94 +0,0 @@
-*.pyc
-pin-log.txt
-/base
-/build
-/buildtools
-/chromium/.gclient.tmp
-/chromium/.gclient.tmp_entries
-/chromium/.last_sync_chromium
-/chromium/src/
-/google_apis
-/links
-/links.db
-/mojo
-/native_client
-/net
-/out
-/sde-avx-sse-transition-out.txt
-/testing
-/third_party/android_platform
-/third_party/android_tools
-/third_party/appurify-python
-/third_party/asan
-/third_party/ashmem
-/third_party/binutils
-/third_party/BUILD.gn
-/third_party/catapult
-/third_party/drmemory
-/third_party/gflags/src
-/third_party/icu
-/third_party/ijar
-/third_party/instrumented_libraries
-/third_party/jsr-305
-/third_party/junit
-/third_party/libjpeg
-/third_party/libjpeg_turbo
-/third_party/libxml
-/third_party/llvm
-/third_party/llvm-build
-/third_party/lss
-/third_party/mockito
-/third_party/modp_b64
-/third_party/protobuf
-/third_party/requests
-/third_party/robolectric
-/third_party/WebKit
-/third_party/yasm
-/tools/android
-/tools/clang
-/tools/generate_library_loader
-/tools/gn
-/tools/grit
-/tools/gritsettings/README
-/tools/gritsettings/resource_ids
-/tools/gyp
-/tools/isolate_driver.py
-/tools/memory
-/tools/protoc_wrapper
-/tools/python
-/tools/sanitizer_options
-/tools/swarming_client
-/tools/tsan_suppressions
-/tools/valgrind
-/tools/valgrind-libyuv/libyuv_tests.bat
-/tools/valgrind-libyuv/libyuv_tests.py
-/tools/valgrind-libyuv/libyuv_tests.sh
-/tools/valgrind-libyuv/memcheck/OWNERS
-/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
-/tools/valgrind-libyuv/memcheck/suppressions.txt
-/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
-/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
-/tools/valgrind-libyuv/tsan/OWNERS
-/tools/valgrind-libyuv/tsan/PRESUBMIT.py
-/tools/valgrind-libyuv/tsan/suppressions.txt
-/tools/valgrind-libyuv/tsan/suppressions_mac.txt
-/tools/valgrind-libyuv/tsan/suppressions_win32.txt
-/tools/vim
-/tools/win
-
-# Files generated by CMake build
-cmake_install.cmake
-CMakeCache.txt
-CMakeFiles/
-convert
-libgtest.a
-libyuv.a
-libyuv_unittest
-
-# Files generated by winarm.mk build
-libyuv_arm.lib
-source/*.o
-
-# Files generated by perf
-perf.data
-perf.data.old
diff --git a/files/Android.bp b/files/Android.bp
deleted file mode 100644
index 20b8c234..00000000
--- a/files/Android.bp
+++ /dev/null
@@ -1,179 +0,0 @@
-package {
- default_applicable_licenses: ["external_libyuv_files_license"],
-}
-
-// Added automatically by a large-scale-change
-//
-// large-scale-change included anything that looked like it might be a license
-// text as a license_text. e.g. LICENSE, NOTICE, COPYING etc.
-//
-// Please consider removing redundant or irrelevant files from 'license_text:'.
-// See: http://go/android-license-faq
-license {
- name: "external_libyuv_files_license",
- visibility: [":__subpackages__"],
- license_kinds: [
- "SPDX-license-identifier-BSD",
- ],
- license_text: [
- "LICENSE",
- "LICENSE_THIRD_PARTY",
- "PATENTS",
- ],
-}
-
-cc_library {
- name: "libyuv",
- vendor_available: true,
- product_available: true,
- host_supported: true,
- vndk: {
- enabled: true,
- },
-
- srcs: [
- "source/compare.cc",
- "source/compare_common.cc",
- "source/compare_gcc.cc",
- "source/compare_neon.cc",
- "source/compare_neon64.cc",
- "source/compare_mmi.cc",
- "source/compare_msa.cc",
- "source/convert.cc",
- "source/convert_argb.cc",
- "source/convert_from.cc",
- "source/convert_from_argb.cc",
- "source/convert_to_argb.cc",
- "source/convert_to_i420.cc",
- "source/cpu_id.cc",
- "source/planar_functions.cc",
- "source/rotate.cc",
- "source/rotate_any.cc",
- "source/rotate_argb.cc",
- "source/rotate_common.cc",
- "source/rotate_gcc.cc",
- "source/rotate_mmi.cc",
- "source/rotate_msa.cc",
- "source/rotate_neon.cc",
- "source/rotate_neon64.cc",
- "source/row_any.cc",
- "source/row_common.cc",
- "source/row_gcc.cc",
- "source/row_mmi.cc",
- "source/row_msa.cc",
- "source/row_neon.cc",
- "source/row_neon64.cc",
- "source/scale.cc",
- "source/scale_any.cc",
- "source/scale_argb.cc",
- "source/scale_common.cc",
- "source/scale_gcc.cc",
- "source/scale_mmi.cc",
- "source/scale_msa.cc",
- "source/scale_neon.cc",
- "source/scale_neon64.cc",
- "source/video_common.cc",
- "source/convert_jpeg.cc",
- "source/mjpeg_decoder.cc",
- "source/mjpeg_validate.cc",
- ],
-
- cflags: [
- "-Wall",
- "-Werror",
- "-Wno-unused-parameter",
- "-fexceptions",
- "-DHAVE_JPEG",
- ],
-
- arch: {
- arm: {
- cflags: ["-mfpu=neon"],
- },
- },
-
- shared_libs: ["libjpeg"],
-
- export_include_dirs: ["include"],
-
- apex_available: [
- "//apex_available:platform",
- "com.android.media.swcodec",
- ],
- min_sdk_version: "29",
-}
-
-// compatibilty static library until all uses of libyuv_static are replaced
-// with libyuv (b/37646797)
-cc_library_static {
- name: "libyuv_static",
- vendor_available: true,
- whole_static_libs: ["libyuv"],
- apex_available: [
- "//apex_available:platform",
- "com.android.media.swcodec",
- ],
- min_sdk_version: "29",
-}
-
-cc_test {
- name: "libyuv_unittest",
- static_libs: ["libyuv"],
- shared_libs: ["libjpeg"],
- cflags: ["-Wall", "-Werror"],
- srcs: [
- "unit_test/unit_test.cc",
- "unit_test/basictypes_test.cc",
- "unit_test/color_test.cc",
- "unit_test/compare_test.cc",
- "unit_test/convert_test.cc",
- "unit_test/cpu_test.cc",
- "unit_test/cpu_thread_test.cc",
- "unit_test/math_test.cc",
- "unit_test/planar_test.cc",
- "unit_test/rotate_argb_test.cc",
- "unit_test/rotate_test.cc",
- "unit_test/scale_argb_test.cc",
- "unit_test/scale_test.cc",
- "unit_test/video_common_test.cc",
- ],
-}
-
-cc_test {
- name: "compare",
- gtest: false,
- srcs: [
- "util/compare.cc",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "cpuid",
- gtest: false,
- srcs: [
- "util/cpuid.c",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "psnr",
- gtest: false,
- srcs: [
- "util/psnr_main.cc",
- "util/psnr.cc",
- "util/ssim.cc",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "yuvconvert",
- gtest: false,
- srcs: [
- "util/yuvconvert.cc",
- ],
- static_libs: ["libyuv"],
- shared_libs: ["libjpeg"],
-}
diff --git a/files/DEPS b/files/DEPS
deleted file mode 100644
index c5f81b86..00000000
--- a/files/DEPS
+++ /dev/null
@@ -1,1096 +0,0 @@
-vars = {
- 'chromium_git': 'https://chromium.googlesource.com',
- 'chromium_revision': '4476bd69d1c8e4e1cde8633d3b33c992f7d3a6d0',
- 'swarming_revision': '0e3e1c4dc4e79f25a5b58fcbc135dc93183c0c54',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling lss
- # and whatever else without interference from each other.
- 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling catapult
- # and whatever else without interference from each other.
- 'catapult_revision': 'a24a725f7834c16b3628bfb63f349b3480bf9592',
- # the commit queue can handle CLs rolling android_sdk_build-tools_version
- # and whatever else without interference from each other.
- 'android_sdk_build-tools_version': 'DLK621q5_Bga5EsOr7cp6bHWWxFKx6UHLu_Ix_m3AckC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_emulator_version
- # and whatever else without interference from each other.
- 'android_sdk_emulator_version': 'ki7EDQRAiZAUYlnTWR1XmI6cJTk65fJ-DNZUU1zrtS8C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_extras_version
- # and whatever else without interference from each other.
- 'android_sdk_extras_version': 'iIwhhDox5E-mHgwUhCz8JACWQCpUjdqt5KTY9VLugKQC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_patcher_version
- # and whatever else without interference from each other.
- 'android_sdk_patcher_version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_platform-tools_version
- # and whatever else without interference from each other.
- 'android_sdk_platform-tools_version': '4Y2Cb2LGzoc-qt-oIUIlhySotJaKeE3ELFedSVe6Uk8C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_platforms_version
- # and whatever else without interference from each other.
- 'android_sdk_platforms_version': 'Kg2t9p0YnQk8bldUv4VA3o156uPXLUfIFAmVZ-Gm5ewC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_sources_version
- # and whatever else without interference from each other.
- 'android_sdk_sources_version': 'K9uEn3JvNELEVjjVK_GQD3ZQD3rqAnJSxCWxjmUmRkgC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_tools_version
- # and whatever else without interference from each other.
- 'android_sdk_tools_version': 'wYcRQC2WHsw2dKWs4EA7fw9Qsyzu1ds1_fRjKmGxe5QC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_tools-lint_version
- # and whatever else without interference from each other.
- 'android_sdk_tools-lint_version': '89hXqZYzCum3delB5RV7J_QyWkaRodqdtQS0s3LMh3wC',
-}
-
-deps = {
- 'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + '669e41d6f18842ed5740449662a71b715dc607c6',
- 'src/buildtools':
- Var('chromium_git') + '/chromium/buildtools.git' + '@' + '0e1cbc4eab6861b0c84bf2ed9a3c4b7aa2063819',
- 'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + 'b1c6aeebeabcc177a83ff0a33dc6c3ab03d4aa94',
- 'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + 'be3e0fc18f2e9ea14d0e9369e539eae5986335fd',
- 'src/third_party/catapult':
- Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'),
- 'src/third_party/colorama/src':
- Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
- 'src/third_party/freetype/src':
- Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'd01e28f41f8810c8ea422b854f8722659589fa99',
- 'src/third_party/googletest/src':
- Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '879ac092fde0a19e1b3a61b2546b2a422b1528bc',
- 'src/third_party/harfbuzz-ng/src':
- Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '26c5b54fb09fb45e02c9c4618bcea4958c698953',
- 'src/third_party/libjpeg_turbo':
- Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '61a2bbaa9aec89cb2c882d87ace6aba9aee49bb9',
- 'src/third_party/yasm/source/patched-yasm':
- Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
- 'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '419541c8352b3b75a99c9a5a7c0d1e7b92f3fcf7',
- 'src/tools/swarming_client':
- Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'),
-
- # libyuv-only dependencies (not present in Chromium).
- 'src/third_party/gflags':
- Var('chromium_git') + '/external/webrtc/deps/third_party/gflags' + '@' + '892576179b45861b53e04a112996a738309cf364',
- 'src/third_party/gflags/src':
- Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca',
- 'src/third_party/gtest-parallel':
- Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
-
- 'src/third_party/lss': {
- 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
- 'condition': 'checkout_android or checkout_linux',
- },
-
- # Android deps:
- 'src/third_party/accessibility_test_framework': {
- 'packages': [
- {
- 'package': 'chromium/third_party/accessibility-test-framework',
- 'version': 'version:2.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/auto/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
- 'condition': 'checkout_android',
- },
- 'src/base': {
- 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '162a5d66ad148f26bbbe6b6ecaf5c1bafa2173e6',
- 'condition': 'checkout_android',
- },
- 'src/third_party/bazel': {
- 'packages': [
- {
- 'package': 'chromium/third_party/bazel',
- 'version': 'version:0.10.0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/bouncycastle': {
- 'packages': [
- {
- 'package': 'chromium/third_party/bouncycastle',
- 'version': 'version:1.46-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_ndk': {
- 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '4e2cea441bfd43f0863d14f57b1e1844260b9884',
- 'condition': 'checkout_android',
- },
- 'src/third_party/android_support_test_runner': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_support_test_runner',
- 'version': 'version:0.5-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_tools': {
- 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'e958d6ea74442d4e0849bb8a018d215a0e78981d',
- 'condition': 'checkout_android',
- },
- 'src/third_party/android_sdk/public': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_sdk/public/build-tools',
- 'version': Var('android_sdk_build-tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/emulator',
- 'version': Var('android_sdk_emulator_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/extras',
- 'version': Var('android_sdk_extras_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/patcher',
- 'version': Var('android_sdk_patcher_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/platform-tools',
- 'version': Var('android_sdk_platform-tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/platforms',
- 'version': Var('android_sdk_platforms_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/sources',
- 'version': Var('android_sdk_sources_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/tools',
- 'version': Var('android_sdk_tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/tools-lint',
- 'version': Var('android_sdk_tools-lint_version'),
- },
- ],
- 'condition': 'checkout_android_native_support',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_build_tools/aapt2': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_tools_aapt2',
- 'version': 'version:3.2.0-alpha18-4804415-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/byte_buddy': {
- 'packages': [
- {
- 'package': 'chromium/third_party/byte_buddy',
- 'version': 'version:1.4.17-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ced/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9',
- 'condition': 'checkout_android',
- },
- 'src/third_party/errorprone/lib': {
- 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7',
- 'condition': 'checkout_android',
- },
- 'src/third_party/findbugs': {
- 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67',
- 'condition': 'checkout_android',
- },
- 'src/third_party/gson': {
- 'packages': [
- {
- 'package': 'chromium/third_party/gson',
- 'version': 'version:2.8.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/guava': {
- 'packages': [
- {
- 'package': 'chromium/third_party/guava',
- 'version': 'version:23.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/hamcrest': {
- 'packages': [
- {
- 'package': 'chromium/third_party/hamcrest',
- 'version': 'version:1.3-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/icu': {
- 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd65301491c513d49163ad29c853eb85c02c8d5b4',
- },
- 'src/third_party/icu4j': {
- 'packages': [
- {
- 'package': 'chromium/third_party/icu4j',
- 'version': 'version:53.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/intellij': {
- 'packages': [
- {
- 'package': 'chromium/third_party/intellij',
- 'version': 'version:12.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/jsr-305/src': {
- 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
- 'condition': 'checkout_android',
- },
- 'src/third_party/junit/src': {
- 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
- 'condition': 'checkout_android',
- },
- 'src/third_party/mockito/src': {
- 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
- 'condition': 'checkout_android',
- },
- 'src/third_party/objenesis': {
- 'packages': [
- {
- 'package': 'chromium/third_party/objenesis',
- 'version': 'version:2.4-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ow2_asm': {
- 'packages': [
- {
- 'package': 'chromium/third_party/ow2_asm',
- 'version': 'version:5.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/r8': {
- 'packages': [
- {
- 'package': 'chromium/third_party/r8',
- 'version': 'version:1.0.30',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/proguard': {
- 'packages': [
- {
- 'package': 'chromium/third_party/proguard',
- 'version': '3bd778c422ea5496de2ef25c007a517dbb5ce5ca',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/requests/src': {
- 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
- 'condition': 'checkout_android',
- },
- 'src/third_party/robolectric': {
- 'packages': [
- {
- 'package': 'chromium/third_party/robolectric',
- 'version': 'version:3.5.1',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/robolectric/robolectric': {
- 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403',
- 'condition': 'checkout_android',
- },
- 'src/third_party/sqlite4java': {
- 'packages': [
- {
- 'package': 'chromium/third_party/sqlite4java',
- 'version': 'version:0.282-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ub-uiautomator/lib': {
- 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
- 'condition': 'checkout_android',
- },
- 'src/third_party/xstream': {
- 'packages': [
- {
- 'package': 'chromium/third_party/xstream',
- 'version': 'version:1.4.8-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- # iOS deps:
- 'src/ios': {
- 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '44be3c093cf2db7ab4cf1997d6a1a07722f1f391',
- 'condition': 'checkout_ios'
- },
-
- # Win deps:
- # Dependencies used by libjpeg-turbo
- 'src/third_party/yasm/binaries': {
- 'url': Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
- 'condition': 'checkout_win',
- },
-
- # === ANDROID_DEPS Generated Code Start ===
- # Generated by //tools/android/roll/android_deps/fetch_all.sh
- 'src/third_party/android_deps/libs/android_arch_core_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_design': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_gridlayout_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_multidex': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_palette_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_leanback_v17': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_v14': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_compat': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_v13': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_v4': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_transition': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_play_core': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core',
- 'version': 'version:1.3.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_squareup_javapoet': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
- 'version': 'version:1.11.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- # === ANDROID_DEPS Generated Code End ===
-}
-
-# Define rules for which include paths are allowed in our source.
-include_rules = [ '+gflags' ]
-
-pre_deps_hooks = [
- {
- # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
- # TODO(kjellander): Remove this in March 2017.
- 'name': 'cleanup_links',
- 'pattern': '.',
- 'action': ['python', 'src/cleanup_links.py'],
- },
-]
-
-hooks = [
- {
-    # This clobbers when necessary (based on get_landmines.py). It should be
-    # an early hook, but it needs to run after syncing Chromium and setting
-    # up the links so that the script actually exists.
- 'name': 'landmines',
- 'pattern': '.',
- 'action': [
- 'python',
- 'src/build/landmines.py',
- '--landmine-scripts',
- 'src/tools_libyuv/get_landmines.py',
- '--src-dir',
- 'src',
- ],
- },
- # Downloads the current stable linux sysroot to build/linux/ if needed.
- {
- 'name': 'sysroot_arm',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_arm',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=arm'],
- },
- {
- 'name': 'sysroot_arm64',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_arm64',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=arm64'],
- },
- {
- 'name': 'sysroot_x86',
- 'pattern': '.',
- 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=x86'],
- },
- {
- 'name': 'sysroot_mips',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_mips',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=mips'],
- },
- {
- 'name': 'sysroot_x64',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_x64',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=x64'],
- },
- {
- # Update the Windows toolchain if necessary.
- 'name': 'win_toolchain',
- 'pattern': '.',
- 'action': ['python', 'src/build/vs_toolchain.py', 'update'],
- },
- {
- # Update the Mac toolchain if necessary.
- 'name': 'mac_toolchain',
- 'pattern': '.',
- 'action': ['python', 'src/build/mac_toolchain.py'],
- },
-  # Pull binutils for Linux, which enables debug fission for faster linking /
-  # debugging when used with clang on Ubuntu Precise.
- # https://code.google.com/p/chromium/issues/detail?id=352046
- {
- 'name': 'binutils',
- 'pattern': 'src/third_party/binutils',
- 'action': [
- 'python',
- 'src/third_party/binutils/download.py',
- ],
- },
- {
- # Pull clang if needed or requested via GYP_DEFINES.
- # Note: On Win, this should run after win_toolchain, as it may use it.
- 'name': 'clang',
- 'pattern': '.',
- 'action': ['python', 'src/tools/clang/scripts/update.py'],
- },
- {
- # Update LASTCHANGE.
- 'name': 'lastchange',
- 'pattern': '.',
- 'action': ['python', 'src/build/util/lastchange.py',
- '-o', 'src/build/util/LASTCHANGE'],
- },
- # Pull GN binaries.
- {
- 'name': 'gn_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/win/gn.exe.sha1',
- ],
- },
- {
- 'name': 'gn_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/mac/gn.sha1',
- ],
- },
- {
- 'name': 'gn_linux64',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/linux64/gn.sha1',
- ],
- },
- # Pull clang-format binaries using checked-in hashes.
- {
- 'name': 'clang_format_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/win/clang-format.exe.sha1',
- ],
- },
- {
- 'name': 'clang_format_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/mac/clang-format.sha1',
- ],
- },
- {
- 'name': 'clang_format_linux',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/linux64/clang-format.sha1',
- ],
- },
- # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
- {
- 'name': 'luci-go_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/win64',
- ],
- },
- {
- 'name': 'luci-go_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/mac64',
- ],
- },
- {
- 'name': 'luci-go_linux',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/linux64',
- ],
- },
- {
- # We used to use src as a CIPD root. We moved it to a different directory
- # in crrev.com/c/930178 but left the clobber here to ensure that that CL
- # could be reverted safely. This can be safely removed once crbug.com/794764
- # is resolved.
- 'name': 'Android Clobber Deprecated CIPD Root',
- 'pattern': '.',
- 'condition': 'checkout_android',
- 'action': ['src/build/cipd/clobber_cipd_root.py',
- '--root', 'src',
- ],
- },
-  # Android dependencies. Many are downloaded from Google Storage these days.
-  # The entries are copied from https://cs.chromium.org/chromium/src/DEPS for
-  # all such dependencies we share with Chromium.
- {
- # This downloads SDK extras and puts them in the
- # third_party/android_sdk/public/extras directory.
- 'name': 'sdkextras',
- 'condition': 'checkout_android',
- 'pattern': '.',
- 'action': ['vpython',
- 'src/build/android/play_services/update.py',
- 'download'
- ],
- },
-]
-
-recursedeps = [
- # buildtools provides clang_format, libc++, and libc++abi.
- 'src/buildtools',
-]
diff --git a/files/LICENSE b/files/LICENSE
deleted file mode 100644
index c911747a..00000000
--- a/files/LICENSE
+++ /dev/null
@@ -1,29 +0,0 @@
-Copyright 2011 The LibYuv Project Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- * Neither the name of Google nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/files/LICENSE_THIRD_PARTY b/files/LICENSE_THIRD_PARTY
deleted file mode 100644
index a71591e7..00000000
--- a/files/LICENSE_THIRD_PARTY
+++ /dev/null
@@ -1,8 +0,0 @@
-This source tree contains third party source code which is governed by third
-party licenses. This file contains references to files which are under other
-licenses than the one provided in the LICENSE file in the root of the source
-tree.
-
-Files governed by third party licenses:
-source/x86inc.asm
-
diff --git a/files/all.gyp b/files/all.gyp
deleted file mode 100644
index 88a74842..00000000
--- a/files/all.gyp
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2013 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# all.gyp and the All target are for the benefit of the Android GYP build.
-{
- 'targets': [
- {
- 'target_name': 'All',
- 'type': 'none',
- 'dependencies': [
- 'libyuv.gyp:*',
- 'libyuv_test.gyp:*',
- ],
- },
- ],
-}
diff --git a/files/chromium/.gclient b/files/chromium/.gclient
deleted file mode 100644
index c1a86ecf..00000000
--- a/files/chromium/.gclient
+++ /dev/null
@@ -1,20 +0,0 @@
-solutions = [{
- 'name': 'src',
- 'url': 'https://chromium.googlesource.com/chromium/src.git',
- 'deps_file': '.DEPS.git',
- 'managed': False,
- 'custom_deps': {
- # Skip syncing some large dependencies Libyuv will never need.
- 'src/third_party/cld_2/src': None,
- 'src/third_party/ffmpeg': None,
- 'src/third_party/hunspell_dictionaries': None,
- 'src/third_party/liblouis/src': None,
- 'src/third_party/pdfium': None,
- 'src/third_party/skia': None,
- 'src/third_party/trace-viewer': None,
- 'src/third_party/webrtc': None,
- },
- 'safesync_url': ''
-}]
-
-cache_dir = None
diff --git a/files/chromium/README b/files/chromium/README
deleted file mode 100644
index 127f4b52..00000000
--- a/files/chromium/README
+++ /dev/null
@@ -1,5 +0,0 @@
-This .gclient file is used to download a copy of Chromium.
-Libyuv uses the Chromium build toolchain and a number of shared
-dependencies by creating symlinks to folders in this checkout,
-using the ../setup_links.py script.
-
diff --git a/files/gyp_libyuv b/files/gyp_libyuv
deleted file mode 100755
index 445b924f..00000000
--- a/files/gyp_libyuv
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This script is used to run GYP for libyuv. It contains selected parts of the
-# main function from the src/build/gyp_chromium file.
-
-import glob
-import os
-import shlex
-import sys
-
-checkout_root = os.path.dirname(os.path.realpath(__file__))
-
-sys.path.insert(0, os.path.join(checkout_root, 'build'))
-import gyp_chromium
-import gyp_helper
-import vs_toolchain
-
-sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
-import gyp
-
-def GetSupplementalFiles():
- """Returns a list of the supplemental files that are included in all GYP
- sources."""
- # Can't use the one in gyp_chromium since the directory location of the root
- # is different.
- return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
-
-
-if __name__ == '__main__':
- args = sys.argv[1:]
-
- if int(os.environ.get('GYP_CHROMIUM_NO_ACTION', 0)):
- print 'Skipping gyp_libyuv due to GYP_CHROMIUM_NO_ACTION env var.'
- sys.exit(0)
-
- # This could give false positives since it doesn't actually do real option
- # parsing. Oh well.
- gyp_file_specified = False
- for arg in args:
- if arg.endswith('.gyp'):
- gyp_file_specified = True
- break
-
- # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
- if not gyp_file_specified:
- # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
- # work, but chdir'ing and adding the relative path does. Spooky :/
- os.chdir(checkout_root)
- args.append('all.gyp')
-
-  # There shouldn't be a circular dependency relationship between .gyp files.
- args.append('--no-circular-check')
-
- # Default to ninja unless GYP_GENERATORS is set.
- if not os.environ.get('GYP_GENERATORS'):
- os.environ['GYP_GENERATORS'] = 'ninja'
-
- vs2013_runtime_dll_dirs = None
- if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
- vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
-
- # Enforce gyp syntax checking. This adds about 20% execution time.
- args.append('--check')
-
- supplemental_includes = gyp_chromium.GetSupplementalFiles()
- gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
-
- # Automatically turn on crosscompile support for platforms that need it.
- if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
- gyp_vars_dict.get('OS') in ['android', 'ios'],
- 'GYP_CROSSCOMPILE' not in os.environ)):
- os.environ['GYP_CROSSCOMPILE'] = '1'
-
- args.extend(['-I' + i for i in
- gyp_chromium.additional_include_files(supplemental_includes,
- args)])
-
- # Set the gyp depth variable to the root of the checkout.
- args.append('--depth=' + os.path.relpath(checkout_root))
-
- print 'Updating projects from gyp files...'
- sys.stdout.flush()
-
- # Off we go...
- gyp_rc = gyp.main(args)
-
- if vs2013_runtime_dll_dirs:
- x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
- vs_toolchain.CopyVsRuntimeDlls(
- os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
- (x86_runtime, x64_runtime))
-
- sys.exit(gyp_rc)
diff --git a/files/gyp_libyuv.py b/files/gyp_libyuv.py
deleted file mode 100644
index bb32ec39..00000000
--- a/files/gyp_libyuv.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-
-# This script is a modified copy of the src/build/gyp_chromium.py file.
-# It is needed for parallel processing.
-
-# This file is (possibly, depending on python version) imported by
-# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
-# through the multiprocessing library.
-
-# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
-# imports that don't end in .py (and aren't directories with an
-# __init__.py). This wrapper makes "import gyp_libyuv" work with
-# those old versions and makes it possible to execute gyp_libyuv.py
-# directly on Windows where the extension is useful.
-
-import os
-
-path = os.path.abspath(os.path.split(__file__)[0])
-execfile(os.path.join(path, 'gyp_libyuv'))
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
deleted file mode 100644
index f571142f..00000000
--- a/files/include/libyuv/convert.h
+++ /dev/null
@@ -1,504 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_H_
-#define INCLUDE_LIBYUV_CONVERT_H_
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/rotate.h" // For enum RotationMode.
-
-// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
-#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
-#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
-#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert I444 to I420.
-LIBYUV_API
-int I444ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I444 to NV21.
-LIBYUV_API
-int I444ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Convert I422 to I420.
-LIBYUV_API
-int I422ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I422 to NV21.
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Copy I420 to I420.
-#define I420ToI420 I420Copy
-LIBYUV_API
-int I420Copy(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Copy I010 to I010
-#define I010ToI010 I010Copy
-#define H010ToH010 I010Copy
-LIBYUV_API
-int I010Copy(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert 10 bit YUV to 8 bit
-#define H010ToH420 I010ToI420
-LIBYUV_API
-int I010ToI420(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I400 (grey) to I420.
-LIBYUV_API
-int I400ToI420(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I400 (grey) to NV21.
-LIBYUV_API
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-#define J400ToJ420 I400ToI420
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert NV21 to I420.
-LIBYUV_API
-int NV21ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert AYUV to NV12.
-LIBYUV_API
-int AYUVToNV12(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height);
-
-// Convert AYUV to NV21.
-LIBYUV_API
-int AYUVToNV21(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert Android420 to I420.
-LIBYUV_API
-int Android420ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// ARGB little endian (bgra in memory) to I420.
-LIBYUV_API
-int ARGBToI420(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// BGRA little endian (argb in memory) to I420.
-LIBYUV_API
-int BGRAToI420(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// ABGR little endian (rgba in memory) to I420.
-LIBYUV_API
-int ABGRToI420(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGBA little endian (abgr in memory) to I420.
-LIBYUV_API
-int RGBAToI420(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to J420.
-LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB big endian (rgb in memory) to I420.
-LIBYUV_API
-int RAWToI420(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB16 (RGBP fourcc) little endian to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB15 (RGBO fourcc) little endian to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB12 (R444 fourcc) little endian to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to J400.
-LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_yj,
- int dst_stride_yj,
- int width,
- int height);
-
-#ifdef HAVE_JPEG
-// src_width/height are provided by the capture device.
-// dst_width/height, used for clipping, determine the final size.
-LIBYUV_API
-int MJPGToI420(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-
-// JPEG to NV21
-LIBYUV_API
-int MJPGToNV21(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-
-// Query size of MJPG in pixels.
-LIBYUV_API
-int MJPGSize(const uint8_t* sample,
- size_t sample_size,
- int* width,
- int* height);
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
-// "dst_stride_y" number of bytes in a row of the dst_y plane.
-// Normally this would be the same as dst_width, with recommended alignment
-// to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected. The caller should
-// allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-// Normally this would be the same as (dst_width + 1) / 2, with
-// recommended alignment to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-// To center, crop_x = (src_width - dst_width) / 2
-// crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-// "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-// Must be less than or equal to src_width/src_height
-// Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "fourcc" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToI420(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int crop_x,
- int crop_y,
- int src_width,
- int src_height,
- int crop_width,
- int crop_height,
- enum RotationMode rotation,
- uint32_t fourcc);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_H_
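
The ConvertToI420 comment block above packs the stride, crop, and rotation rules into prose. A minimal caller-side sketch follows (not part of the diff): the 640x480 YUY2 frame, the helper name, and the use of FOURCC_YUY2 from libyuv/video_common.h are illustrative assumptions.

#include <cstddef>
#include <cstdint>
#include <vector>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"  // Assumed home of the FOURCC constants.

// Convert a 640x480 YUY2 camera frame to I420 with a 90 degree rotation.
bool SampleToI420(const uint8_t* sample, size_t sample_size) {
  const int src_width = 640, src_height = 480;
  const int crop_width = src_width, crop_height = src_height;  // Pre-rotation.
  // With kRotate90 the output dimensions swap, so the caller allocates the
  // I420 planes (and strides) from the rotated size.
  const int dst_width = crop_height, dst_height = crop_width;
  const int y_stride = dst_width;  // Recommended: align to 16 for speed.
  const int uv_stride = (dst_width + 1) / 2;
  std::vector<uint8_t> y(y_stride * dst_height);
  std::vector<uint8_t> u(uv_stride * ((dst_height + 1) / 2));
  std::vector<uint8_t> v(uv_stride * ((dst_height + 1) / 2));
  return libyuv::ConvertToI420(sample, sample_size,
                               y.data(), y_stride,
                               u.data(), uv_stride,
                               v.data(), uv_stride,
                               /*crop_x=*/0, /*crop_y=*/0,
                               src_width, src_height,
                               crop_width, crop_height,
                               libyuv::kRotate90,
                               libyuv::FOURCC_YUY2) == 0;
}
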
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
deleted file mode 100644
index e8ed1f59..00000000
--- a/files/include/libyuv/convert_argb.h
+++ /dev/null
@@ -1,721 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
-#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/rotate.h" // For enum RotationMode.
-
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Alias.
-#define ARGBToARGB ARGBCopy
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Duplicate prototype for function in convert_from.h for remoting.
-LIBYUV_API
-int I420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I010 to ARGB.
-LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I010 to ABGR.
-LIBYUV_API
-int I010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H010 to ARGB.
-LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H010 to ABGR.
-LIBYUV_API
-int H010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I420 with Alpha to preattenuated ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- int attenuate);
-
-// Convert I420 with Alpha to preattenuated ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height,
- int attenuate);
-
-// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
-LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J400 (jpeg grey) to ARGB.
-LIBYUV_API
-int J400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Alias.
-#define YToARGB I400ToARGB
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert NV12 to ABGR.
-LIBYUV_API
-int NV12ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert NV21 to ABGR.
-LIBYUV_API
-int NV21ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert NV12 to RGB24.
-LIBYUV_API
-int NV12ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-// Convert NV21 to RGB24.
-LIBYUV_API
-int NV21ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-// Convert NV21 to YUV24.
-LIBYUV_API
-int NV21ToYUV24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_yuv24,
- int dst_stride_yuv24,
- int width,
- int height);
-
-// Convert NV12 to RAW.
-LIBYUV_API
-int NV12ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-// Convert NV21 to RAW.
-LIBYUV_API
-int NV21ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I010 to AR30.
-LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H010 to AR30.
-LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert I010 to AB30.
-LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-// Convert H010 to AB30.
-LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-// BGRA little endian (argb in memory) to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// ABGR little endian (rgba in memory) to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGBA little endian (abgr in memory) to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Deprecated function name.
-#define BG24ToARGB RGB24ToARGB
-
-// RGB little endian (bgr in memory) to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB big endian (rgb in memory) to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB16 (RGBP fourcc) little endian to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB15 (RGBO fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB12 (R444 fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Aliases
-#define AB30ToARGB AR30ToABGR
-#define AB30ToABGR AR30ToARGB
-#define AB30ToAR30 AR30ToAB30
-
-// Convert AR30 To ARGB.
-LIBYUV_API
-int AR30ToARGB(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert AR30 To ABGR.
-LIBYUV_API
-int AR30ToABGR(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert AR30 To AB30.
-LIBYUV_API
-int AR30ToAB30(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-#ifdef HAVE_JPEG
-// src_width/height are provided by the capture device.
-// dst_width/height, used for clipping, determine the final size.
-LIBYUV_API
-int MJPGToARGB(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-#endif
-
-// Convert Android420 to ARGB.
-LIBYUV_API
-int Android420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert Android420 to ABGR.
-LIBYUV_API
-int Android420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "sample_size" is needed to parse MJPG.
-// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
-// Normally this would be the same as dst_width, with recommended alignment
-// to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected. The caller should
-// allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-// Normally this would be the same as (dst_width + 1) / 2, with
-// recommended alignment to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-// To center, crop_x = (src_width - dst_width) / 2
-// crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-// "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-// Must be less than or equal to src_width/src_height
-// Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "fourcc" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToARGB(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int crop_x,
- int crop_y,
- int src_width,
- int src_height,
- int crop_width,
- int crop_height,
- enum RotationMode rotation,
- uint32_t fourcc);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
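
The ARGB converters above count strides in bytes, so a tightly packed destination row is width * 4. A short sketch of that convention (not from the diff; the helper name and packed stride are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/convert_argb.h"

// Convert an I420 frame to ARGB. ARGB strides are in bytes, so a tightly
// packed destination row is width * 4.
void I420FrameToARGB(const uint8_t* y, int y_stride,
                     const uint8_t* u, int u_stride,
                     const uint8_t* v, int v_stride,
                     int width, int height,
                     std::vector<uint8_t>* argb) {
  const int argb_stride = width * 4;
  argb->resize(static_cast<size_t>(argb_stride) * height);
  libyuv::I420ToARGB(y, y_stride, u, u_stride, v, v_stride,
                     argb->data(), argb_stride, width, height);
}
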
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
deleted file mode 100644
index c64e0216..00000000
--- a/files/include/libyuv/rotate.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROTATE_H_
-#define INCLUDE_LIBYUV_ROTATE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported rotation.
-typedef enum RotationMode {
- kRotate0 = 0, // No rotation.
- kRotate90 = 90, // Rotate 90 degrees clockwise.
- kRotate180 = 180, // Rotate 180 degrees.
- kRotate270 = 270, // Rotate 270 degrees clockwise.
-
- // Deprecated.
- kRotateNone = 0,
- kRotateClockwise = 90,
- kRotateCounterClockwise = 270,
-} RotationModeEnum;
-
-// Rotate I420 frame.
-LIBYUV_API
-int I420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate I444 frame.
-LIBYUV_API
-int I444Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate NV12 input and store in I420.
-LIBYUV_API
-int NV12ToI420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate a plane by 0, 90, 180, or 270.
-LIBYUV_API
-int RotatePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate planes by 90, 180, 270. Deprecated.
-LIBYUV_API
-void RotatePlane90(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotatePlane180(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotatePlane270(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotateUV90(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
-LIBYUV_API
-void RotateUV180(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-LIBYUV_API
-void RotateUV270(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-// The 90 and 270 functions are based on transposes.
-// Doing a transpose with a reversed read or write
-// order results in a rotation by +/- 90 degrees.
-// Deprecated.
-LIBYUV_API
-void TransposePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void TransposeUV(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_ROTATE_H_
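
For kRotate90 and kRotate270 the destination planes swap width and height, as the I420Rotate comments imply. A sketch of that bookkeeping (not from the diff; the helper name and tightly packed strides are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/rotate.h"

// Rotate an I420 frame by 90 degrees. Destination strides are computed
// from the rotated (swapped) dimensions.
void RotateI420By90(const uint8_t* src_y, int src_stride_y,
                    const uint8_t* src_u, int src_stride_u,
                    const uint8_t* src_v, int src_stride_v,
                    int width, int height) {
  const int dst_width = height, dst_height = width;
  const int dst_stride_y = dst_width;
  const int dst_stride_uv = (dst_width + 1) / 2;
  std::vector<uint8_t> dst_y(dst_stride_y * dst_height);
  std::vector<uint8_t> dst_u(dst_stride_uv * ((dst_height + 1) / 2));
  std::vector<uint8_t> dst_v(dst_stride_uv * ((dst_height + 1) / 2));
  libyuv::I420Rotate(src_y, src_stride_y, src_u, src_stride_u,
                     src_v, src_stride_v,
                     dst_y.data(), dst_stride_y,
                     dst_u.data(), dst_stride_uv,
                     dst_v.data(), dst_stride_uv,
                     width, height, libyuv::kRotate90);
}
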
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
deleted file mode 100644
index 23ba1634..00000000
--- a/files/include/libyuv/scale.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_SCALE_H_
-#define INCLUDE_LIBYUV_SCALE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported filtering.
-typedef enum FilterMode {
- kFilterNone = 0, // Point sample; Fastest.
- kFilterLinear = 1, // Filter horizontally only.
- kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
- kFilterBox = 3 // Highest quality.
-} FilterModeEnum;
-
-// Scale a YUV plane.
-LIBYUV_API
-void ScalePlane(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-void ScalePlane_16(const uint16_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint16_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-// Scales a YUV 4:2:0 image from the src width and height to the
-// dst width and height.
-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If filtering is kFilterBilinear, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// If filtering is kFilterBox, averaging is used to produce an even better
-// quality image, at a further expense of speed.
-// Returns 0 if successful.
-
-LIBYUV_API
-int I420Scale(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-int I420Scale_16(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-// Scales a YUV 4:4:4 image from the src width and height to the
-// dst width and height.
-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If filtering is kFilterBilinear, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// If filtering is kFilterBox, averaging is used to produce an even better
-// quality image, at a further expense of speed.
-// Returns 0 if successful.
-
-LIBYUV_API
-int I444Scale(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-int I444Scale_16(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-#ifdef __cplusplus
-// Legacy API. Deprecated.
-LIBYUV_API
-int Scale(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- int src_stride_y,
- int src_stride_u,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int dst_stride_y,
- int dst_stride_u,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- LIBYUV_BOOL interpolate);
-
-// For testing, allow disabling of specialized scalers.
-LIBYUV_API
-void SetUseReferenceImpl(LIBYUV_BOOL use);
-#endif // __cplusplus
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_SCALE_H_
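
The FilterMode comments above trade speed against quality. A sketch wiring them into I420Scale (not from the diff; the helper name, half-size target, and packed strides are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/scale.h"

// Downscale an I420 frame to half size. kFilterBox is the highest-quality
// (and slowest) of the four FilterMode options; kFilterNone is the fastest.
int HalveI420(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_u, int src_stride_u,
              const uint8_t* src_v, int src_stride_v,
              int src_width, int src_height) {
  const int dst_width = src_width / 2, dst_height = src_height / 2;
  const int dst_stride_y = dst_width;
  const int dst_stride_uv = (dst_width + 1) / 2;
  std::vector<uint8_t> dst_y(dst_stride_y * dst_height);
  std::vector<uint8_t> dst_u(dst_stride_uv * ((dst_height + 1) / 2));
  std::vector<uint8_t> dst_v(dst_stride_uv * ((dst_height + 1) / 2));
  return libyuv::I420Scale(src_y, src_stride_y, src_u, src_stride_u,
                           src_v, src_stride_v, src_width, src_height,
                           dst_y.data(), dst_stride_y,
                           dst_u.data(), dst_stride_uv,
                           dst_v.data(), dst_stride_uv,
                           dst_width, dst_height, libyuv::kFilterBox);
}
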
diff --git a/files/infra/config/PRESUBMIT.py b/files/infra/config/PRESUBMIT.py
deleted file mode 100644
index 89eaa519..00000000
--- a/files/infra/config/PRESUBMIT.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright 2018 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-
-def CheckChangeOnUpload(input_api, output_api):
- return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
-
-
-def CheckChangeOnCommit(input_api, output_api):
- return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
diff --git a/files/infra/config/README.md b/files/infra/config/README.md
deleted file mode 100644
index c036d610..00000000
--- a/files/infra/config/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This directory contains configuration files for infra services.
diff --git a/files/infra/config/cq.cfg b/files/infra/config/cq.cfg
deleted file mode 100644
index 7bcc0595..00000000
--- a/files/infra/config/cq.cfg
+++ /dev/null
@@ -1,51 +0,0 @@
-# Commit Queue configuration file. The documentation of the format can be found
-# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg.
-
-version: 1
-cq_status_url: "https://chromium-cq-status.appspot.com"
-git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git"
-
-gerrit {}
-
-verifiers {
- gerrit_cq_ability {
- committer_list: "project-libyuv-committers"
- dry_run_access_list: "project-libyuv-tryjob-access"
- }
-
- try_job {
- buckets {
- name: "luci.libyuv.try"
- builders { name: "win" }
- builders { name: "win_rel" }
- builders { name: "win_x64_rel" }
- builders { name: "win_clang" }
- builders { name: "win_clang_rel" }
- builders { name: "win_x64_clang_rel" }
- builders { name: "mac" }
- builders { name: "mac_rel" }
- builders { name: "mac_asan" }
- builders { name: "ios" }
- builders { name: "ios_rel" }
- builders { name: "ios_arm64" }
- builders { name: "ios_arm64_rel" }
- builders { name: "linux" }
- builders { name: "linux_rel" }
- builders {
- name: "linux_gcc"
- experiment_percentage: 100
- }
- builders { name: "linux_memcheck" }
- builders { name: "linux_tsan2" }
- builders { name: "linux_asan" }
- builders { name: "linux_msan" }
- builders { name: "linux_ubsan" }
- builders { name: "linux_ubsan_vptr" }
- builders { name: "android" }
- builders { name: "android_rel" }
- builders { name: "android_arm64" }
- builders { name: "android_x86" }
- builders { name: "android_x64" }
- }
- }
-}
diff --git a/files/libyuv_nacl.gyp b/files/libyuv_nacl.gyp
deleted file mode 100644
index b8fe57ee..00000000
--- a/files/libyuv_nacl.gyp
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-{
- 'includes': [
- 'libyuv.gypi',
- '../../native_client/build/untrusted.gypi',
- ],
- 'targets': [
- {
- 'target_name': 'libyuv_nacl',
- 'type': 'none',
- 'variables': {
- 'nlib_target': 'libyuv_nacl.a',
- 'build_glibc': 0,
- 'build_newlib': 0,
- 'build_pnacl_newlib': 1,
- },
- 'include_dirs': [
- 'include',
- ],
- 'direct_dependent_settings': {
- 'include_dirs': [
- 'include',
- ],
- },
- 'sources': [
- '<@(libyuv_sources)',
- ],
- }, # target libyuv_nacl
- ]
-}
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
deleted file mode 100644
index 87e7a5bb..00000000
--- a/files/libyuv_test.gyp
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2011 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-{
- 'variables': {
- # Can be enabled if your libjpeg has GYP support.
- 'libyuv_disable_jpeg%': 1,
- 'mips_msa%': 0, # Default to msa off.
- },
- 'targets': [
- {
- 'target_name': 'libyuv_unittest',
- 'type': '<(gtest_target_type)',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- 'testing/gtest.gyp:gtest',
- 'third_party/gflags/gflags.gyp:gflags',
- ],
- 'direct_dependent_settings': {
- 'defines': [
- 'GTEST_RELATIVE_PATH',
- ],
- },
- 'export_dependent_settings': [
- '<(DEPTH)/testing/gtest.gyp:gtest',
- ],
- 'sources': [
- # headers
- 'unit_test/unit_test.h',
-
- # sources
- 'unit_test/basictypes_test.cc',
- 'unit_test/compare_test.cc',
- 'unit_test/color_test.cc',
- 'unit_test/convert_test.cc',
- 'unit_test/cpu_test.cc',
- 'unit_test/math_test.cc',
- 'unit_test/planar_test.cc',
- 'unit_test/rotate_argb_test.cc',
- 'unit_test/rotate_test.cc',
- 'unit_test/scale_argb_test.cc',
- 'unit_test/scale_test.cc',
- 'unit_test/unit_test.cc',
- 'unit_test/video_common_test.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- [ 'OS == "ios"', {
- 'xcode_settings': {
- 'DEBUGGING_SYMBOLS': 'YES',
- 'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
- # Work around compile issue with isosim.mm, see
- # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
- 'WARNING_CFLAGS': [
- '-Wno-sometimes-uninitialized',
- ],
- },
- 'cflags': [
- '-Wno-sometimes-uninitialized',
- ],
- }],
- [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
- 'defines': [
- 'HAVE_JPEG',
- ],
- }],
- ['OS=="android"', {
- 'dependencies': [
- '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
- ],
- }],
- # TODO(YangZhang): These lines can be removed when high accuracy
- # YUV to RGB conversion is ported to Neon.
- [ '(target_arch == "armv7" or target_arch == "armv7s" \
- or (target_arch == "arm" and arm_version >= 7) \
- or target_arch == "arm64") \
- and (arm_neon == 1 or arm_neon_optional == 1)', {
- 'defines': [
- 'LIBYUV_NEON'
- ],
- }],
- [ '(target_arch == "mipsel" or target_arch == "mips64el") \
- and (mips_msa == 1)', {
- 'defines': [
- 'LIBYUV_MSA'
- ],
- }],
- ], # conditions
- 'defines': [
- # Enable the following 3 macros to turn off assembly for the specified CPU.
- # 'LIBYUV_DISABLE_X86',
- # 'LIBYUV_DISABLE_NEON',
- # 'LIBYUV_DISABLE_DSPR2',
- # Enable the following macro to build libyuv as a shared library (dll).
- # 'LIBYUV_USING_SHARED_LIBRARY',
- ],
- },
- {
- 'target_name': 'compare',
- 'type': 'executable',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'sources': [
- # sources
- 'util/compare.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- ], # conditions
- },
- {
- 'target_name': 'yuvconvert',
- 'type': 'executable',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'sources': [
- # sources
- 'util/yuvconvert.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- ], # conditions
- },
- # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
- {
- 'target_name': 'psnr',
- 'type': 'executable',
- 'sources': [
- # sources
- 'util/psnr_main.cc',
- 'util/psnr.cc',
- 'util/ssim.cc',
- ],
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'conditions': [
- [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
- 'defines': [
- 'HAVE_JPEG',
- ],
- }],
- ], # conditions
- },
-
- {
- 'target_name': 'cpuid',
- 'type': 'executable',
- 'sources': [
- # sources
- 'util/cpuid.c',
- ],
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- },
- ], # targets
- 'conditions': [
- ['OS=="android"', {
- 'targets': [
- {
- 'target_name': 'yuv_unittest_apk',
- 'type': 'none',
- 'variables': {
- 'test_suite_name': 'yuv_unittest',
- 'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
- },
- 'includes': [
- 'build/apk_test.gypi',
- ],
- 'dependencies': [
- 'libyuv_unittest',
- ],
- },
- ],
- }],
- ],
-}
-
-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/files/public.mk b/files/public.mk
deleted file mode 100644
index 1342307a..00000000
--- a/files/public.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-# This file contains all the common make variables which are useful for
-# anyone depending on this library.
-# Note that dependencies on the NDK are not listed directly, since the NDK
-# adds them automatically.
-
-LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
-
-LIBYUV_C_FLAGS :=
-
-LIBYUV_CPP_FLAGS :=
-
-LIBYUV_LDLIBS :=
-LIBYUV_DEP_MODULES :=
diff --git a/files/setup_links.py b/files/setup_links.py
deleted file mode 100755
index b2b459e6..00000000
--- a/files/setup_links.py
+++ /dev/null
@@ -1,497 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Setup links to a Chromium checkout for WebRTC.
-
-WebRTC standalone shares a lot of dependencies and build tools with Chromium.
-To do this, many of the paths of a Chromium checkout are emulated by creating
-symlinks to files and directories. This script handles the setup of symlinks to
-achieve this.
-
-It also handles cleanup of the legacy Subversion-based approach that was used
-before Chrome switched over their master repo from Subversion to Git.
-"""
-
-
-import ctypes
-import errno
-import logging
-import optparse
-import os
-import shelve
-import shutil
-import subprocess
-import sys
-import textwrap
-
-
-DIRECTORIES = [
- 'build',
- 'buildtools',
- 'mojo', # TODO(kjellander): Remove, see webrtc:5629.
- 'native_client',
- 'net',
- 'testing',
- 'third_party/binutils',
- 'third_party/drmemory',
- 'third_party/instrumented_libraries',
- 'third_party/libjpeg',
- 'third_party/libjpeg_turbo',
- 'third_party/llvm-build',
- 'third_party/lss',
- 'third_party/yasm',
- 'third_party/WebKit', # TODO(kjellander): Remove, see webrtc:5629.
- 'tools/clang',
- 'tools/gn',
- 'tools/gyp',
- 'tools/memory',
- 'tools/python',
- 'tools/swarming_client',
- 'tools/valgrind',
- 'tools/vim',
- 'tools/win',
-]
-
-from sync_chromium import get_target_os_list
-target_os = get_target_os_list()
-if 'android' in target_os:
- DIRECTORIES += [
- 'base',
- 'third_party/android_platform',
- 'third_party/android_tools',
- 'third_party/appurify-python',
- 'third_party/ashmem',
- 'third_party/catapult',
- 'third_party/icu',
- 'third_party/ijar',
- 'third_party/jsr-305',
- 'third_party/junit',
- 'third_party/libxml',
- 'third_party/mockito',
- 'third_party/modp_b64',
- 'third_party/protobuf',
- 'third_party/requests',
- 'third_party/robolectric',
- 'tools/android',
- 'tools/grit',
- ]
-if 'ios' in target_os:
- DIRECTORIES.append('third_party/class-dump')
-
-FILES = {
- 'tools/isolate_driver.py': None,
- 'third_party/BUILD.gn': None,
-}
-
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
-LINKS_DB = 'links'
-
-# Version management to make future upgrades/downgrades easier to support.
-SCHEMA_VERSION = 1
-
-
-def query_yes_no(question, default=False):
- """Ask a yes/no question via raw_input() and return their answer.
-
- Modified from http://stackoverflow.com/a/3041990.
- """
- prompt = " [%s/%%s]: "
- prompt = prompt % ('Y' if default is True else 'y')
- prompt = prompt % ('N' if default is False else 'n')
-
- if default is None:
- default = 'INVALID'
-
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if choice == '' and default != 'INVALID':
- return default
-
- if 'yes'.startswith(choice):
- return True
- elif 'no'.startswith(choice):
- return False
-
- print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
-
-
-# Actions
-class Action(object):
- def __init__(self, dangerous):
- self.dangerous = dangerous
-
- def announce(self, planning):
- """Log a description of this action.
-
- Args:
- planning - True iff we're in the planning stage, False if we're in the
- doit stage.
- """
- pass
-
- def doit(self, links_db):
- """Execute the action, recording what we did to links_db, if necessary."""
- pass
-
-
-class Remove(Action):
- def __init__(self, path, dangerous):
- super(Remove, self).__init__(dangerous)
- self._priority = 0
- self._path = path
-
- def announce(self, planning):
- log = logging.warn
- filesystem_type = 'file'
- if not self.dangerous:
- log = logging.info
- filesystem_type = 'link'
- if planning:
- log('Planning to remove %s: %s', filesystem_type, self._path)
- else:
- log('Removing %s: %s', filesystem_type, self._path)
-
- def doit(self, _):
- os.remove(self._path)
-
-
-class Rmtree(Action):
- def __init__(self, path):
- super(Rmtree, self).__init__(dangerous=True)
- self._priority = 0
- self._path = path
-
- def announce(self, planning):
- if planning:
- logging.warn('Planning to remove directory: %s', self._path)
- else:
- logging.warn('Removing directory: %s', self._path)
-
- def doit(self, _):
- if sys.platform.startswith('win'):
- # shutil.rmtree() doesn't work on Windows if any of the directories are
- # read-only, which svn repositories are.
- subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
- else:
- shutil.rmtree(self._path)
-
-
-class Makedirs(Action):
- def __init__(self, path):
- super(Makedirs, self).__init__(dangerous=False)
- self._priority = 1
- self._path = path
-
- def doit(self, _):
- try:
- os.makedirs(self._path)
- except OSError as e:
- if e.errno != errno.EEXIST:
- raise
-
-
-class Symlink(Action):
- def __init__(self, source_path, link_path):
- super(Symlink, self).__init__(dangerous=False)
- self._priority = 2
- self._source_path = source_path
- self._link_path = link_path
-
- def announce(self, planning):
- if planning:
- logging.info(
- 'Planning to create link from %s to %s', self._link_path,
- self._source_path)
- else:
- logging.debug(
- 'Linking from %s to %s', self._link_path, self._source_path)
-
- def doit(self, links_db):
- # Files not in the root directory need relative path calculation.
- # On Windows, use absolute paths instead since NTFS doesn't seem to support
- # relative paths for symlinks.
- if sys.platform.startswith('win'):
- source_path = os.path.abspath(self._source_path)
- else:
- if os.path.dirname(self._link_path) != self._link_path:
- source_path = os.path.relpath(self._source_path,
- os.path.dirname(self._link_path))
-
- os.symlink(source_path, os.path.abspath(self._link_path))
- links_db[self._source_path] = self._link_path
-
-
-class LinkError(IOError):
- """Failed to create a link."""
- pass
-
-
-# Handles symlink creation on the different platforms.
-if sys.platform.startswith('win'):
- def symlink(source_path, link_path):
- flag = 1 if os.path.isdir(source_path) else 0
- if not ctypes.windll.kernel32.CreateSymbolicLinkW(
- unicode(link_path), unicode(source_path), flag):
- raise OSError('Failed to create symlink to %s. Notice that only NTFS '
- 'version 5.0 and up has all the needed APIs for '
- 'creating symlinks.' % source_path)
- os.symlink = symlink
-
-
-class WebRTCLinkSetup(object):
- def __init__(self, links_db, force=False, dry_run=False, prompt=False):
- self._force = force
- self._dry_run = dry_run
- self._prompt = prompt
- self._links_db = links_db
-
- def CreateLinks(self, on_bot):
- logging.debug('CreateLinks')
- # First, make a plan of action
- actions = []
-
- for source_path, link_path in FILES.iteritems():
- actions += self._ActionForPath(
- source_path, link_path, check_fn=os.path.isfile, check_msg='files')
- for source_dir in DIRECTORIES:
- actions += self._ActionForPath(
- source_dir, None, check_fn=os.path.isdir,
- check_msg='directories')
-
- if not on_bot and self._force:
- # When making the manual switch from legacy SVN checkouts to the new
- # Git-based Chromium DEPS, the .gclient_entries file that contains cached
- # URLs for all DEPS entries must be removed to avoid future sync problems.
- entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
- if os.path.exists(entries_file):
- actions.append(Remove(entries_file, dangerous=True))
-
- actions.sort()
-
- if self._dry_run:
- for action in actions:
- action.announce(planning=True)
- logging.info('Not doing anything because dry-run was specified.')
- sys.exit(0)
-
- if any(a.dangerous for a in actions):
- logging.warn('Dangerous actions:')
- for action in (a for a in actions if a.dangerous):
- action.announce(planning=True)
- print
-
- if not self._force:
- logging.error(textwrap.dedent("""\
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- A C T I O N   R E Q U I R E D
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- Because chromium/src is transitioning to Git (from SVN), we needed to
- change the way that the WebRTC standalone checkout works. Instead of
- individually syncing subdirectories of Chromium in SVN, we're now
- syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
- into the `chromium/src` directory.
-
- As such, all Chromium directories which are currently pulled by DEPS are
- now replaced with a symlink into the full Chromium checkout.
-
- To avoid disrupting developers, we've chosen to not delete your
- directories forcibly, in case you have some work in progress in one of
- them :).
-
- ACTION REQUIRED:
- Before running `gclient sync|runhooks` again, you must run:
- %s%s --force
-
- This will replace all directories which must now be symlinks, after
- prompting with a summary of the work to be done.
- """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
- sys.exit(1)
- elif self._prompt:
- if not query_yes_no('Would you like to perform the above plan?'):
- sys.exit(1)
-
- for action in actions:
- action.announce(planning=False)
- action.doit(self._links_db)
-
- if not on_bot and self._force:
- logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
- 'let the remaining hooks (that probably were interrupted) '
- 'execute.')
-
- def CleanupLinks(self):
- logging.debug('CleanupLinks')
- for source, link_path in self._links_db.iteritems():
- if source == 'SCHEMA_VERSION':
- continue
- if os.path.islink(link_path) or sys.platform.startswith('win'):
- # os.path.islink() always returns false on Windows
- # See http://bugs.python.org/issue13143.
- logging.debug('Removing link to %s at %s', source, link_path)
- if not self._dry_run:
- if os.path.exists(link_path):
- if sys.platform.startswith('win') and os.path.isdir(link_path):
- subprocess.check_call(['rmdir', '/q', '/s', link_path],
- shell=True)
- else:
- os.remove(link_path)
- del self._links_db[source]
-
- @staticmethod
- def _ActionForPath(source_path, link_path=None, check_fn=None,
- check_msg=None):
- """Create zero or more Actions to link to a file or directory.
-
- This will be a symlink on POSIX platforms. On Windows this requires
- that NTFS is version 5.0 or higher (Vista or newer).
-
- Args:
- source_path: Path relative to the Chromium checkout root.
- For readability, the path may contain slashes, which will
- automatically be converted to the right path delimiter on Windows.
- link_path: The location for the link to create. If omitted it will be the
- same path as source_path.
- check_fn: A function returning true if the type of filesystem object is
- correct for the attempted call. Otherwise an error message with
- check_msg will be printed.
- check_msg: String used to inform the user of an invalid attempt to create
- a file.
- Returns:
- A list of Action objects.
- """
- def fix_separators(path):
- if sys.platform.startswith('win'):
- return path.replace(os.altsep, os.sep)
- else:
- return path
-
- assert check_fn
- assert check_msg
- link_path = link_path or source_path
- link_path = fix_separators(link_path)
-
- source_path = fix_separators(source_path)
- source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
- if os.path.exists(source_path) and not check_fn:
- raise LinkError('_LinkChromiumPath can only be used to link to %s: '
- 'Tried to link to: %s' % (check_msg, source_path))
-
- if not os.path.exists(source_path):
- logging.debug('Silently ignoring missing source: %s. This is to avoid '
- 'errors on platform-specific dependencies.', source_path)
- return []
-
- actions = []
-
- if os.path.exists(link_path) or os.path.islink(link_path):
- if os.path.islink(link_path):
- actions.append(Remove(link_path, dangerous=False))
- elif os.path.isfile(link_path):
- actions.append(Remove(link_path, dangerous=True))
- elif os.path.isdir(link_path):
- actions.append(Rmtree(link_path))
- else:
- raise LinkError('Don\'t know how to plan: %s' % link_path)
-
- # Create parent directories to the target link if needed.
- target_parent_dirs = os.path.dirname(link_path)
- if (target_parent_dirs and
- target_parent_dirs != link_path and
- not os.path.exists(target_parent_dirs)):
- actions.append(Makedirs(target_parent_dirs))
-
- actions.append(Symlink(source_path, link_path))
-
- return actions
-
-def _initialize_database(filename):
- links_database = shelve.open(filename)
-
- # Wipe the database if this version of the script ends up looking at a
- # newer (future) version of the links db, just to be sure.
- version = links_database.get('SCHEMA_VERSION')
- if version and version != SCHEMA_VERSION:
- logging.info('Found database with schema version %s while this script only '
- 'supports %s. Wiping previous database contents.', version,
- SCHEMA_VERSION)
- links_database.clear()
- links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
- return links_database
-
-
-def main():
- on_bot = os.environ.get('CHROME_HEADLESS') == '1'
-
- parser = optparse.OptionParser()
- parser.add_option('-d', '--dry-run', action='store_true', default=False,
- help='Print what would be done, but don\'t perform any '
- 'operations. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-c', '--clean-only', action='store_true', default=False,
- help='Only clean previously created links, don\'t create '
- 'new ones. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-f', '--force', action='store_true', default=on_bot,
- help='Force link creation. CAUTION: This deletes existing '
- 'folders and files in the locations where links are '
- 'about to be created.')
- parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
- default=(not on_bot),
- help='Prompt if we\'re planning to do a dangerous action')
- parser.add_option('-v', '--verbose', action='store_const',
- const=logging.DEBUG, default=logging.INFO,
- help='Print verbose output for debugging.')
- options, _ = parser.parse_args()
-
- if options.dry_run or options.force or options.clean_only:
- options.verbose = logging.DEBUG
- logging.basicConfig(format='%(message)s', level=options.verbose)
-
- # Work from the root directory of the checkout.
- script_dir = os.path.dirname(os.path.abspath(__file__))
- os.chdir(script_dir)
-
- if sys.platform.startswith('win'):
- def is_admin():
- try:
- return os.getuid() == 0
- except AttributeError:
- return ctypes.windll.shell32.IsUserAnAdmin() != 0
- if not is_admin():
- logging.error('On Windows, you now need to have administrator '
- 'privileges for the shell running %s (or '
- '`gclient sync|runhooks`).\nPlease start another command '
- 'prompt as Administrator and try again.', sys.argv[0])
- return 1
-
- if not os.path.exists(CHROMIUM_CHECKOUT):
- logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
- 'sync" before running this script?', CHROMIUM_CHECKOUT)
- return 2
-
- links_database = _initialize_database(LINKS_DB)
- try:
- symlink_creator = WebRTCLinkSetup(links_database, options.force,
- options.dry_run, options.prompt)
- symlink_creator.CleanupLinks()
- if not options.clean_only:
- symlink_creator.CreateLinks(on_bot)
- except LinkError as e:
- print >> sys.stderr, e.message
- return 3
- finally:
- links_database.close()
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
deleted file mode 100644
index 676527c1..00000000
--- a/files/source/compare_gcc.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#if defined(__x86_64__)
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint64_t diff = 0u;
-
- asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
-
- // Process 32 bytes per loop.
- LABELALIGN
- "1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- :
- : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
-
- return static_cast<uint32_t>(diff);
-}
-#else
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- // Process 16 bytes per loop.
- LABELALIGN
- "1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "+r"(diff) // %3
- :
- : "memory", "cc", "ecx", "edx");
-
- return diff;
-}
-#endif
-
-static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
- 15, 15, 15, 15, 15, 15, 15, 15};
-static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- : "m"(kNibbleMask), // %4
- "m"(kBitCount) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-
- return diff;
-}
-
-#ifdef HAS_HAMMINGDISTANCE_AVX2
-uint32_t HammingDistance_AVX2(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- "vbroadcastf128 %4,%%ymm2 \n"
- "vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
-
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
- "vzeroupper \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- : "m"(kNibbleMask), // %4
- "m"(kBitCount) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-
- return diff;
-}
-#endif // HAS_HAMMINGDISTANCE_AVX2
-
-uint32_t SumSquareError_SSE2(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
- return sse;
-}
-
-static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
- uint32_t hash;
- asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
- return hash;
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
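
The HashDjb2_SSE41 kernel above evaluates 16 steps of the djb2 recurrence per iteration: the running hash is multiplied by kHash16x33 (33^16) while the 16 new bytes are weighted by the kHashMul tables (33^15 down to 33^0) and summed in. A scalar sketch of the same hash for reference (HashDjb2_Ref is an illustrative name, not a library entry point):

    #include <stdint.h>

    // One djb2 step per byte: hash = hash * 33 + byte, starting from seed.
    // The SIMD kernel computes the identical value 16 bytes at a time.
    static uint32_t HashDjb2_Ref(const uint8_t* src, int count, uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash = hash * 33u + src[i];
      }
      return hash;
    }
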
diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc
deleted file mode 100644
index 7640d946..00000000
--- a/files/source/compare_mmi.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// Hakmem method for Hamming distance.
-uint32_t HammingDistance_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
- uint64_t c1 = 0x5555555555555555;
- uint64_t c2 = 0x3333333333333333;
- uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
- uint32_t c4 = 0x01010101;
- uint64_t s1 = 1, s2 = 2, s3 = 4;
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[ta], 0(%[src_a]) \n\t"
- "ldc1 %[tb], 0(%[src_b]) \n\t"
- "xor %[temp], %[ta], %[tb] \n\t"
- "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
- "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
- "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
- "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
- "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
- "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
- "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
- "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
- "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
- "and %[temp1], %[temp1], %[c3] \n\t" //&c3
- "dmfc1 $t0, %[temp1] \n\t"
- "dsrl32 $t0, $t0, 0 \n\t "
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "dmfc1 $t0, %[temp1] \n\t"
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "daddiu %[src_a], %[src_a], 8 \n\t"
- "daddiu %[src_b], %[src_b], 8 \n\t"
- "addiu %[count], %[count], -8 \n\t"
- "bgtz %[count], 1b \n\t"
- "nop \n\t"
- : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
- [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
- [temp1] "+f"(temp1)
- : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
- [s2] "f"(s2), [s3] "f"(s3)
- : "memory");
- return diff;
-}
-
-uint32_t SumSquareError_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse = 0u;
- uint32_t sse_hi = 0u, sse_lo = 0u;
-
- uint64_t src1, src2;
- uint64_t diff, diff_hi, diff_lo;
- uint64_t sse_sum, sse_tmp;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
-
- "1: \n\t"
- "ldc1 %[src1], 0x00(%[src_a]) \n\t"
- "ldc1 %[src2], 0x00(%[src_b]) \n\t"
- "pasubub %[diff], %[src1], %[src2] \n\t"
- "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
- "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
- "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
- "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
-
- "daddiu %[src_a], %[src_a], 0x08 \n\t"
- "daddiu %[src_b], %[src_b], 0x08 \n\t"
- "daddiu %[count], %[count], -0x08 \n\t"
- "bnez %[count], 1b \n\t"
-
- "mfc1 %[sse_lo], %[sse_sum] \n\t"
- "mfhc1 %[sse_hi], %[sse_sum] \n\t"
- "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
- : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
- [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
- [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
- [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
- : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
- [mask] "f"(mask)
- : "memory");
-
- return sse;
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
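
The MMI kernel above is the classic Hakmem-style SWAR bit count applied to each 64-bit XOR of the inputs. A plain-C sketch of the same reduction (names are illustrative, and count is assumed to be a multiple of 8, as in the kernel):

    #include <stdint.h>
    #include <string.h>

    // SWAR population count of one 64-bit word, mirroring the c1/c2/c3
    // constants in the assembly: 2-bit, then 4-bit, then 8-bit partial sums,
    // finally gathered into the top byte with a multiply.
    static uint32_t PopCount64(uint64_t x) {
      x -= (x >> 1) & 0x5555555555555555ULL;
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
      return (uint32_t)((x * 0x0101010101010101ULL) >> 56);
    }

    // Hamming distance over count bytes.
    static uint32_t HammingDistance_Ref(const uint8_t* a, const uint8_t* b,
                                        int count) {
      uint32_t diff = 0u;
      for (int i = 0; i < count; i += 8) {
        uint64_t wa, wb;
        memcpy(&wa, a + i, sizeof(wa));
        memcpy(&wb, b + i, sizeof(wb));
        diff += PopCount64(wa ^ wb);
      }
      return diff;
    }
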
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
deleted file mode 100644
index 2a2181e0..00000000
--- a/files/source/compare_neon.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// 256 bits at a time
-// uses short accumulator which restricts count to 131 KB
-uint32_t HammingDistance_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff;
-
- asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
-
- "1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
-
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
-
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
- :
- : "cc", "q0", "q1", "q2", "q3", "q4");
- return diff;
-}
-
-uint32_t SumSquareError_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
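
Both NEON kernels accumulate the plain sum of squared byte differences; the assembly widens pairs of 8-bit differences to 16 bits and multiply-accumulates into 32-bit lanes before a final horizontal add. A scalar sketch of the value being computed (SumSquareError_Ref is an illustrative name):

    #include <stdint.h>

    static uint32_t SumSquareError_Ref(const uint8_t* a, const uint8_t* b,
                                       int count) {
      uint32_t sse = 0u;
      for (int i = 0; i < count; ++i) {
        int d = (int)a[i] - (int)b[i];  // difference in [-255, 255]
        sse += (uint32_t)(d * d);
      }
      return sse;
    }
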
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
deleted file mode 100644
index 6e8f672a..00000000
--- a/files/source/compare_neon64.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// 256 bits at a time
-// uses short accumulator which restricts count to 131 KB
-uint32_t HammingDistance_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff;
- asm volatile(
- "movi v4.8h, #0 \n"
-
- "1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
-
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
- :
- : "cc", "v0", "v1", "v2", "v3", "v4");
- return diff;
-}
-
-uint32_t SumSquareError_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
-
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
-
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
- return sse;
-}
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert.cc b/files/source/convert.cc
deleted file mode 100644
index 614fa482..00000000
--- a/files/source/convert.cc
+++ /dev/null
@@ -1,2576 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/row.h"
-#include "libyuv/scale.h" // For ScalePlane()
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
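-// Shift right by s with upward rounding of the magnitude, symmetric about
-// zero: SUBSAMPLE(5, 1, 1) == 3 and SUBSAMPLE(-5, 1, 1) == -3.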
-#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_y_width,
- int src_y_height,
- int src_uv_width,
- int src_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
- const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
- if (src_uv_width == 0 || src_uv_height == 0) {
- return -1;
- }
- if (dst_y) {
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
- dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
- }
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
- dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
- dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
- return 0;
-}
-
-// Copy I420 with optional flipping.
-// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
-// it does row coalescing.
-LIBYUV_API
-int I420Copy(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- // Copy UV planes.
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// Copy I010 with optional flipping.
-LIBYUV_API
-int I010Copy(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- // Copy UV planes.
- CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// Convert 10 bit YUV to 8 bit.
-LIBYUV_API
-int I010ToI420(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
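- // A scale of 16384 converts 10-bit samples to 8 bits: Convert16To8Plane
- // computes (v * scale) >> 16, and (v * 16384) >> 16 == v >> 2.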
- // Convert Y plane.
- Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
- height);
- // Convert UV planes.
- Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
- halfheight);
- Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
- halfheight);
- return 0;
-}
-
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I422ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int src_uv_width = SUBSAMPLE(width, 1, 1);
- return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, src_uv_width, height);
-}
-
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
-}
-
-#ifdef I422TONV21_ROW_VERSION
-// Unittest fails for this version.
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-// Swap src_u and src_v to implement I422ToNV12
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int y;
- void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
- uint8_t* dst_uv, int width) = MergeUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = InterpolateRow_C;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- MergeUVRow = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MergeUVRow = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MergeUVRow = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow = MergeUVRow_MMI;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
- }
- }
-#endif
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
- }
- {
- // Allocate 2 rows of vu.
- int awidth = halfwidth * 2;
- align_buffer_64(row_vu_0, awidth * 2);
- uint8_t* row_vu_1 = row_vu_0 + awidth;
-
- for (y = 0; y < height - 1; y += 2) {
- MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
- MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
- halfwidth);
- InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
- src_u += src_stride_u * 2;
- src_v += src_stride_v * 2;
- dst_vu += dst_stride_vu;
- }
- if (height & 1) {
- MergeUVRow(src_v, src_u, dst_vu, halfwidth);
- }
- free_aligned_buffer_64(row_vu_0);
- }
- return 0;
-}
-#endif // I422TONV21_ROW_VERSION
-
-// 444 chroma is 1x width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I444ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, width, height);
-}
-
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
-}
-
-// I400 is greyscale, typically used in MJPG.
-LIBYUV_API
-int I400ToI420(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
- SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
- return 0;
-}
-
-// I400 is greyscale, typically used in MJPG.
-LIBYUV_API
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!dst_vu || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
- return 0;
-}
-
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
- }
- if (height & 1) {
- CopyRow(src, dst, width);
- }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth-constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width but holds 2 interleaved values per pair of
-// pixels, so src_stride_m420 applies to it as well as to the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-  // Coalesce rows: when the strides equal the width, the plane is
-  // contiguous and can be processed as one long row.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
- dst_stride_v == halfwidth) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_uv = dst_stride_u = dst_stride_v = 0;
- }
-
- if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
- }
-
- // Split UV plane - NV12 / NV21
- SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
-
- return 0;
-}
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
-}
-
-// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
-LIBYUV_API
-int NV21ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
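-  // M420 groups rows as [Y, Y, UV]: the two Y rows of a group are
-  // src_stride_m420 apart, the UV row starts 2 strides into the group, and
-  // consecutive groups (and UV rows) are src_stride_m420 * 3 apart.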
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert YUY2 to I420.
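-// YUY2 packs pixels as Y0 U Y1 V; the UV row function averages chroma from
-// two source rows to produce each subsampled I420 chroma row.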
-LIBYUV_API
-int YUY2ToI420(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- YUY2ToUVRow_C;
- void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
- YUY2ToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUVRow = YUY2ToUVRow_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUVRow = YUY2ToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToYRow = YUY2ToYRow_Any_MSA;
- YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToYRow = YUY2ToYRow_MSA;
- YUY2ToUVRow = YUY2ToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_MMI;
- }
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
- src_yuy2 += src_stride_yuy2 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- }
- return 0;
-}
-
-// Convert UYVY to I420.
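-// UYVY is the byte-swapped variant of YUY2 (U Y0 V Y1) and follows the same
-// two-source-rows-per-chroma-row pattern.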
-LIBYUV_API
-int UYVYToI420(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- UYVYToUVRow_C;
- void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
- UYVYToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- UYVYToUVRow = UYVYToUVRow_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- UYVYToUVRow = UYVYToUVRow_Any_AVX2;
- UYVYToYRow = UYVYToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToUVRow = UYVYToUVRow_AVX2;
- UYVYToYRow = UYVYToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUVRow = UYVYToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToYRow = UYVYToYRow_Any_MSA;
- UYVYToUVRow = UYVYToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- UYVYToYRow = UYVYToYRow_MSA;
- UYVYToUVRow = UYVYToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUVRow = UYVYToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUVRow = UYVYToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
- src_uyvy += src_stride_uyvy * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- }
- return 0;
-}
-
-// Convert AYUV to NV12.
-LIBYUV_API
-int AYUVToNV12(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height) {
- int y;
- void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
- uint8_t* dst_uv, int width) = AYUVToUVRow_C;
- void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
- AYUVToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
- src_stride_ayuv = -src_stride_ayuv;
- }
-// Placeholders for future Intel code.
-#if defined(HAS_AYUVTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- AYUVToUVRow = AYUVToUVRow_Any_SSE2;
- AYUVToYRow = AYUVToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- AYUVToUVRow = AYUVToUVRow_SSE2;
- AYUVToYRow = AYUVToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_AYUVTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- AYUVToUVRow = AYUVToUVRow_Any_AVX2;
- AYUVToYRow = AYUVToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- AYUVToUVRow = AYUVToUVRow_AVX2;
- AYUVToYRow = AYUVToYRow_AVX2;
- }
- }
-#endif
-
-#if defined(HAS_AYUVTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- AYUVToYRow = AYUVToYRow_Any_NEON;
- AYUVToUVRow = AYUVToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- AYUVToYRow = AYUVToYRow_NEON;
- AYUVToUVRow = AYUVToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
- src_ayuv += src_stride_ayuv * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- AYUVToUVRow(src_ayuv, 0, dst_uv, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- }
- return 0;
-}
-
-// Convert AYUV to NV21.
-LIBYUV_API
-int AYUVToNV21(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int y;
- void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
- uint8_t* dst_vu, int width) = AYUVToVURow_C;
- void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
- AYUVToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
- src_stride_ayuv = -src_stride_ayuv;
- }
-// Placeholders for future Intel code.
-#if defined(HAS_AYUVTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- AYUVToVURow = AYUVToVURow_Any_SSE2;
- AYUVToYRow = AYUVToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- AYUVToVURow = AYUVToVURow_SSE2;
- AYUVToYRow = AYUVToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_AYUVTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- AYUVToVURow = AYUVToVURow_Any_AVX2;
- AYUVToYRow = AYUVToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- AYUVToVURow = AYUVToVURow_AVX2;
- AYUVToYRow = AYUVToYRow_AVX2;
- }
- }
-#endif
-
-#if defined(HAS_AYUVTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- AYUVToYRow = AYUVToYRow_Any_NEON;
- AYUVToVURow = AYUVToVURow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- AYUVToYRow = AYUVToYRow_NEON;
- AYUVToVURow = AYUVToVURow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
- src_ayuv += src_stride_ayuv * 2;
- dst_y += dst_stride_y * 2;
- dst_vu += dst_stride_vu;
- }
- if (height & 1) {
- AYUVToVURow(src_ayuv, 0, dst_vu, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- }
- return 0;
-}
-
-// Convert ARGB to I420.
-LIBYUV_API
-int ARGBToI420(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- }
- return 0;
-}
-
-// Convert BGRA to I420.
-LIBYUV_API
-int BGRAToI420(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- BGRAToUVRow_C;
- void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
- BGRAToYRow_C;
- if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
-#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
- BGRAToYRow = BGRAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- BGRAToYRow = BGRAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- BGRAToUVRow = BGRAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToYRow = BGRAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToUVRow = BGRAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToYRow = BGRAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToUVRow = BGRAToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
- src_bgra += src_stride_bgra * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- }
- return 0;
-}
-
-// Convert ABGR to I420.
-LIBYUV_API
-int ABGRToI420(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ABGRToUVRow_C;
- void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
- ABGRToYRow_C;
- if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
- ABGRToYRow = ABGRToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ABGRToUVRow = ABGRToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
- src_abgr += src_stride_abgr * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGBA to I420.
-LIBYUV_API
-int RGBAToI420(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGBAToUVRow_C;
- void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
- RGBAToYRow_C;
- if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgba = src_rgba + (height - 1) * src_stride_rgba;
- src_stride_rgba = -src_stride_rgba;
- }
-#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
- RGBAToYRow = RGBAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGBAToYRow = RGBAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGBAToUVRow = RGBAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYRow = RGBAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToUVRow = RGBAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToYRow = RGBAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToUVRow = RGBAToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
- src_rgba += src_stride_rgba * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGB24 to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB24ToUVRow_C;
- void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
- RGB24ToYRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
- RGB24ToYRow = RGB24ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB24TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
- RGB24ToYRow = RGB24ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYRow = RGB24ToYRow_MSA;
- RGB24ToUVRow = RGB24ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
- RGB24ToYRow = RGB24ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- // Allocate 2 rows of ARGB.
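-    // kRowSize rounds the ARGB row size up to a multiple of 32 bytes so the
-    // two temporary rows stay SIMD-friendly.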
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// TODO(fbarchard): Use Matrix version to implement I420 and J420.
-// Convert RGB24 to J420.
-LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB24ToUVJRow_C;
- void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
- RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYJRow_C;
-#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
- RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- RGB24ToUVJRow = RGB24ToUVJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_AVX2;
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYJRow(src_rgb24, dst_y, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYJRow(row, dst_y, width);
- ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYJRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVJRow(row, 0, dst_u, dst_v, width);
- ARGBToYJRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RAW to I420.
-LIBYUV_API
-int RAWToI420(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
- uint8_t* dst_v, int width) = RAWToUVRow_C;
- void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
- RAWToYRow_C;
-#else
- void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RAWToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
-
-// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RAWToUVRow = RAWToUVRow_Any_NEON;
- RAWToYRow = RAWToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RAWTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToUVRow = RAWToUVRow_Any_MSA;
- RAWToYRow = RAWToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToYRow = RAWToYRow_MSA;
- RAWToUVRow = RAWToUVRow_MSA;
- }
- }
-#elif defined(HAS_RAWTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToUVRow = RAWToUVRow_Any_MMI;
- RAWToYRow = RAWToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RAW to ARGB.
-#else
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RGB565 to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB565ToUVRow_C;
- void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
- RGB565ToYRow_C;
-#else
- void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = RGB565ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
-
-// Neon version does direct RGB565 to YUV.
-#if defined(HAS_RGB565TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
- RGB565ToYRow = RGB565ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB565TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
- RGB565ToYRow = RGB565ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToYRow = RGB565ToYRow_MSA;
- RGB565ToUVRow = RGB565ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB565TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
- RGB565ToYRow = RGB565ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB565 to ARGB.
-#else
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
- {
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB1555 to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGB1555ToUVRow_C;
- void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
- int width) = ARGB1555ToYRow_C;
-#else
- void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
-
-// Neon version does direct ARGB1555 to YUV.
-#if defined(HAS_ARGB1555TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_ARGB1555TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MSA;
- ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
- }
- }
-#elif defined(HAS_ARGB1555TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from ARGB1555 to ARGB.
-#else
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
- {
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB4444 to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGB4444ToUVRow_C;
- void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
- int width) = ARGB4444ToYRow_C;
-#else
- void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
-
-// Neon version does direct ARGB4444 to YUV.
-#if defined(HAS_ARGB4444TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
- ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_ARGB4444TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
- ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from ARGB4444 to ARGB.
-#else
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
- }
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RGB24 to J400.
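-// J400 is a single full-range (JPEG) luma plane.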
-LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_yj,
- int dst_stride_yj,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
- RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
- ARGBToYJRow_C;
-#endif
- if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_NEON;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToYJRow(row, dst_yj, width);
- ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_yj += dst_stride_yj * 2;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToYJRow(row, dst_yj, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
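-// Copy every src_pixel_stride_uv-th byte into a packed plane; general-case
-// fallback for Android420 chroma with an arbitrary pixel stride.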
-static void SplitPixels(const uint8_t* src_u,
- int src_pixel_stride_uv,
- uint8_t* dst_u,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- *dst_u = *src_u;
- ++dst_u;
- src_u += src_pixel_stride_uv;
- }
-}
-
-// Convert Android420 to I420.
-LIBYUV_API
-int Android420ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- const ptrdiff_t vu_off = src_v - src_u;
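-  // Distance between the V and U base pointers: with pixel stride 2, +1
-  // indicates interleaved NV12 chroma and -1 indicates NV21, which the fast
-  // paths below split in a single pass.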
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
-  // Copy UV planes as-is - I420
- if (src_pixel_stride_uv == 1) {
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
-    return 0;
-  }
-  // Split UV planes - NV21
- if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
- SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
- halfwidth, halfheight);
-    return 0;
-  }
-  // Split UV planes - NV12
- if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
- SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- }
-
- for (y = 0; y < halfheight; ++y) {
- SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
- SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
deleted file mode 100644
index 54050333..00000000
--- a/files/source/convert_argb.cc
+++ /dev/null
@@ -1,2371 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB with optional flipping.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-
- CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
- height);
- return 0;
-}
-
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-
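- // 4:2:0: each chroma row covers two luma rows, so advance the U and V
- // pointers only after every odd output row.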
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
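- // When every plane's rows are contiguous in memory the image can be
- // processed as a single long row, paying loop overhead only once.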
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert 10 bit YUV to AR30 with matrix
-// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
-// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
- const uint16_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I210ToAR30Row_C;
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-#if defined(HAS_I210TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I210ToAR30Row = I210ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I210TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I210ToAR30Row = I210ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I210ToAR30Row = I210ToAR30Row_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I010 to AR30.
-LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H010 to AR30.
-LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I010 to AB30.
-LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_ab30, dst_stride_ab30,
- &kYvuI601Constants, width, height);
-}
-
-// Convert H010 to AB30.
-LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_ab30, dst_stride_ab30,
- &kYvuH709Constants, width, height);
-}
-
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
- const uint16_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I210ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I210TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I210ToARGBRow = I210ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I210TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I210ToARGBRow = I210ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I210ToARGBRow = I210ToARGBRow_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I010 to ARGB.
-LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I010 to ABGR.
-LIBYUV_API
-int I010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H010 to ARGB.
-LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H010 to ABGR.
-LIBYUV_API
-int H010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I444ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert I420 with Alpha to ARGB with matrix; optionally premultiplies
-// (attenuates) RGB by alpha.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
- int y;
- void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) = I422AlphaToARGBRow_C;
- void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
- int width) = ARGBAttenuateRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 4)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
- }
- }
-#endif
-
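- // Convert a row, then optionally multiply RGB by alpha in place to
- // produce premultiplied (attenuated) output.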
- for (y = 0; y < height; ++y) {
- I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
- width);
- if (attenuate) {
- ARGBAttenuateRow(dst_argb, dst_argb, width);
- }
- dst_argb += dst_stride_argb;
- src_a += src_stride_a;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 with Alpha to ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, src_a, src_stride_a, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width,
- height, attenuate);
-}
-
-// Convert I420 with Alpha to ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height,
- int attenuate) {
- return I420AlphaToARGBMatrix(
- src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height, attenuate);
-}
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
- I400ToARGBRow_C;
- if (!src_y || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I400ToARGBRow = I400ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I400ToARGBRow = I400ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I400ToARGBRow = I400ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I400ToARGBRow = I400ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I400ToARGBRow = I400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I400ToARGBRow = I400ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I400ToARGBRow = I400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- }
- return 0;
-}
-
-// Convert J400 to ARGB.
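-// J400 is full range luma (JPEG), so no range expansion is applied,
-// unlike I400 above.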
-LIBYUV_API
-int J400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
- J400ToARGBRow_C;
- if (!src_y || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_J400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- J400ToARGBRow = J400ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- J400ToARGBRow = J400ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- J400ToARGBRow = J400ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- J400ToARGBRow = J400ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- J400ToARGBRow = J400ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- J400ToARGBRow = J400ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- J400ToARGBRow = J400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- J400ToARGBRow = J400ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- J400ToARGBRow = J400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- J400ToARGBRow = J400ToARGBRow_MMI;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- J400ToARGBRow(src_y, dst_argb, width);
- src_y += src_stride_y;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
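-// Each shuffle table entry is the source byte index that supplies the
-// corresponding destination byte; 16 entries cover one 128 bit vector.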
-// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
-
-// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
-
-// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
-
-// Convert BGRA to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
-}
-
-// Convert ARGB to BGRA (same as BGRAToARGB).
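-// Reversing all four bytes of a pixel is its own inverse, so the same
-// shuffle table converts in either direction.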
-LIBYUV_API
-int ARGBToBGRA(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
-}
-
-// Convert ABGR to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
-}
-
-// Convert ARGB to ABGR (same as ABGRToARGB).
-LIBYUV_API
-int ARGBToABGR(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
-}
-
-// Convert RGBA to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
-}
-
-// Convert RGB24 to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
- // Coalesce rows.
- if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb24 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToARGBRow = RGB24ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB24ToARGBRow(src_rgb24, dst_argb, width);
- src_rgb24 += src_stride_rgb24;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RAW to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RAWToARGBRow_C;
- if (!src_raw || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- // Coalesce rows.
- if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_raw = dst_stride_argb = 0;
- }
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RAWToARGBRow = RAWToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToARGBRow = RAWToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToARGBRow = RAWToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToARGBRow = RAWToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToARGBRow = RAWToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RGB565 to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
- int width) = RGB565ToARGBRow_C;
- if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
- // Coalesce rows.
- if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb565 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB565ToARGBRow(src_rgb565, dst_argb, width);
- src_rgb565 += src_stride_rgb565;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB1555 to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
- if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
- // Coalesce rows.
- if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb1555 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
- src_argb1555 += src_stride_argb1555;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB4444 to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
- if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
- // Coalesce rows.
- if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb4444 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
- src_argb4444 += src_stride_argb4444;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert AR30 to ARGB.
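-// AR30 packs 2 bit alpha and three 10 bit channels into each 32 bit
-// little endian word; only C row functions are used for these conversions.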
-LIBYUV_API
-int AR30ToARGB(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_argb = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToARGBRow_C(src_ar30, dst_argb, width);
- src_ar30 += src_stride_ar30;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert AR30 to ABGR.
-LIBYUV_API
-int AR30ToABGR(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_abgr = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToABGRRow_C(src_ar30, dst_abgr, width);
- src_ar30 += src_stride_ar30;
- dst_abgr += dst_stride_abgr;
- }
- return 0;
-}
-
-// Convert AR30 to AB30.
-LIBYUV_API
-int AR30ToAB30(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_ab30 = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToAB30Row_C(src_ar30, dst_ab30, width);
- src_ar30 += src_stride_ar30;
- dst_ab30 += dst_stride_ab30;
- }
- return 0;
-}
-
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV21ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
- if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width, height);
-}
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width, height);
-}
-
-// Convert NV12 to ABGR.
-// To output ABGR instead of ARGB, swap the UV and use a mirrored yuv matrix.
-// To swap the UV, use NV21 instead of NV12.
-LIBYUV_API
-int NV12ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to ABGR.
-LIBYUV_API
-int NV21ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width, height);
-}
-
-// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB24Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
- if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_NV12TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB24Row = NV12ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV21ToRGB24Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
- if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_NV21TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToRGB24Row = NV21ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV21TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert NV12 to RGB24.
-LIBYUV_API
-int NV12ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
- dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
- width, height);
-}
-
-// Convert NV21 to RGB24.
-LIBYUV_API
-int NV21ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
- dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
- width, height);
-}
-
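-// RAW is RGB24 with the R and B channels swapped, so NV12ToRAW and
-// NV21ToRAW reuse the opposite NV to RGB24 path with the mirrored Yvu
-// matrix to flip the channel order.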
-// Convert NV12 to RAW.
-LIBYUV_API
-int NV12ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
- dst_stride_raw, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to RAW.
-LIBYUV_API
-int NV21ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
- dst_stride_raw, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to YUV24.
-LIBYUV_API
-int NV21ToYUV24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_yuv24,
- int dst_stride_yuv24,
- int width,
- int height) {
- int y;
- void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
- uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
- if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
- dst_stride_yuv24 = -dst_stride_yuv24;
- }
-#if defined(HAS_NV21TOYUV24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- NV21ToYUV24Row = NV21ToYUV24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TOYUV24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
- dst_yuv24 += dst_stride_yuv24;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-
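- // M420 stores two rows of Y followed by one row of interleaved UV, so
- // each iteration consumes three source rows and produces two ARGB rows.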
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
- return 0;
-}
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants, int width) =
- YUY2ToARGBRow_C;
- if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_yuy2 = dst_stride_argb = 0;
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_MSA;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
- src_yuy2 += src_stride_yuy2;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants, int width) =
- UYVYToARGBRow_C;
- if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_uyvy = dst_stride_argb = 0;
- }
-#if defined(HAS_UYVYTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToARGBRow = UYVYToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_MSA;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
- src_uyvy += src_stride_uyvy;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-static void WeavePixels(const uint8_t* src_u,
- const uint8_t* src_v,
- int src_pixel_stride_uv,
- uint8_t* dst_uv,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst_uv[0] = *src_u;
- dst_uv[1] = *src_v;
- dst_uv += 2;
- src_u += src_pixel_stride_uv;
- src_v += src_pixel_stride_uv;
- }
-}
-
-// Convert Android420 to ARGB with matrix.
-LIBYUV_API
-int Android420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- uint8_t* dst_uv;
- const ptrdiff_t vu_off = src_v - src_u;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-
- // I420
- if (src_pixel_stride_uv == 1) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- yuvconstants, width, height);
- }
- // NV21
- if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- }
- // NV12
- if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- }
-
- // General case fallback: weave U and V into a temporary NV12-style UV plane.
- align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
- dst_uv = plane_uv;
- for (y = 0; y < halfheight; ++y) {
- WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uv += halfwidth * 2;
- }
- NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- free_aligned_buffer_64(plane_uv);
- return 0;
-}
-
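Note: Android420ToARGBMatrix dispatches on the chroma pixel stride: 1 selects planar I420; 2 with V one byte before U (vu_off == -1) selects NV21; 2 with V one byte after U (vu_off == 1) selects NV12; any other layout is woven into a temporary NV12-style UV plane by WeavePixels. A hedged usage sketch for an Android YUV_420_888-style image (the Plane struct and field names are illustrative, not an Android or libyuv API):

    struct Plane { const uint8_t* data; int row_stride; int pixel_stride; };
    int ConvertImage(const Plane p[3],  // p[0]=Y, p[1]=U, p[2]=V
                     uint8_t* argb, int argb_stride, int width, int height) {
      return Android420ToARGB(p[0].data, p[0].row_stride, p[1].data,
                              p[1].row_stride, p[2].data, p[2].row_stride,
                              p[1].pixel_stride,  // typically 1 or 2
                              argb, argb_stride, width, height);
    }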
-// Convert Android420 to ARGB.
-LIBYUV_API
-int Android420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, src_pixel_stride_uv, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width,
- height);
-}
-
-// Convert Android420 to ABGR.
-LIBYUV_API
-int Android420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, src_pixel_stride_uv, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width,
- height);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
deleted file mode 100644
index 60140cb4..00000000
--- a/files/source/convert_from.cc
+++ /dev/null
@@ -1,1505 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/convert.h" // For I420Copy
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/row.h"
-#include "libyuv/scale.h" // For ScalePlane()
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
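Note: SUBSAMPLE rounds the magnitude up while preserving sign, so both odd and negative (mirrored) dimensions halve correctly: SUBSAMPLE(5, 1, 1) is (5 + 1) >> 1 = 3, and SUBSAMPLE(-5, 1, 1) is -((5 + 1) >> 1) = -3. The same computation as a plain function, for reference:

    static int Subsample(int v, int a, int s) {
      return v < 0 ? -((-v + a) >> s) : ((v + a) >> s);
    }
    // Subsample(4, 1, 1) == 2, Subsample(5, 1, 1) == 3,
    // Subsample(-4, 1, 1) == -2, Subsample(-5, 1, 1) == -3.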
-// I420 to any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_y_width,
- int src_y_height,
- int dst_uv_width,
- int dst_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
- const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
- dst_uv_height <= 0) {
- return -1;
- }
- if (dst_y) {
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
- dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
- }
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
- dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
- dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
- return 0;
-}
-
-// Convert 8 bit YUV to 10 bit.
-LIBYUV_API
-int I420ToI010(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- // Convert Y plane.
- Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
- height);
- // Convert UV planes.
- Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
- halfheight);
- Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
- halfheight);
- return 0;
-}
-
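Note: the scale of 1024 asks Convert8To16Plane for a full-range 8-bit to 10-bit expansion, i.e. 0 maps to 0 and 255 maps to 1023 rather than the 1020 a bare left shift by 2 would give. One scalar way to realize that mapping, shown as a sketch rather than the library's actual row code, is to replicate the top bits into the low bits:

    static uint16_t Expand8To10(uint8_t v) {
      // (v << 2) fills the top 10 bits; (v >> 6) repeats the top two
      // bits at the bottom, so 0 -> 0 and 255 -> 1023.
      return (uint16_t)((v << 2) | (v >> 6));
    }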
-// 420 chroma is 1/2 width, 1/2 height
-// 422 chroma is 1/2 width, 1x height
-LIBYUV_API
-int I420ToI422(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int dst_uv_width = (Abs(width) + 1) >> 1;
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, dst_uv_width,
- dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 444 chroma is 1x width, 1x height
-LIBYUV_API
-int I420ToI444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int dst_uv_width = Abs(width);
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, dst_uv_width,
- dst_uv_height);
-}
-
-// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
-LIBYUV_API
-int I400Copy(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- if (!src_y || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-LIBYUV_API
-int I422ToYUY2(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_yuy2,
- int dst_stride_yuy2,
- int width,
- int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2;
- }
- return 0;
-}
-
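Note: YUY2 packs two luma samples around one shared chroma pair per 4-byte macropixel, in the byte order Y0 U Y1 V, which is what I422ToYUY2Row produces from the three I422 planes. A scalar reference for an even width (a sketch of the row semantics, not the SIMD code):

    static void PackYUY2Row(const uint8_t* y, const uint8_t* u,
                            const uint8_t* v, uint8_t* yuy2, int width) {
      for (int x = 0; x < width; x += 2) {
        yuy2[0] = y[x];      // Y0
        yuy2[1] = u[x / 2];  // shared U
        yuy2[2] = y[x + 1];  // Y1
        yuy2[3] = v[x / 2];  // shared V
        yuy2 += 4;
      }
    }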
-LIBYUV_API
-int I420ToYUY2(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_yuy2,
- int dst_stride_yuy2,
- int width,
- int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
- dst_yuy2 + dst_stride_yuy2, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2 * 2;
- }
- if (height & 1) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- }
- return 0;
-}
-
-LIBYUV_API
-int I422ToUYVY(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_uyvy,
- int dst_stride_uyvy,
- int width,
- int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy;
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToUYVY(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_uyvy,
- int dst_stride_uyvy,
- int width,
- int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
- dst_uyvy + dst_stride_uyvy, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy * 2;
- }
- if (height & 1) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- }
- return 0;
-}
-
-// TODO(fbarchard): test negative height for invert.
-LIBYUV_API
-int I420ToNV12(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height) {
- if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
- height == 0) {
- return -1;
- }
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
- halfwidth, halfheight);
- return 0;
-}
-
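Note: NV12 keeps the full-resolution Y plane and interleaves U and V at half resolution, so I420ToNV12 reduces to a plane copy plus MergeUVPlane. The merge in scalar form (a reference sketch of one row, not the SIMD path):

    static void MergeUVRowRef(const uint8_t* u, const uint8_t* v,
                              uint8_t* uv, int width) {
      for (int x = 0; x < width; ++x) {
        uv[2 * x + 0] = u[x];  // NV12 order: U first,
        uv[2 * x + 1] = v[x];  // then V (I420ToNV21 swaps the planes)
      }
    }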
-LIBYUV_API
-int I420ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
- width, height);
-}
-
-// Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
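Note: the "if (y & 1)" step is how an I422 row kernel serves an I420 source: 4:2:0 chroma covers two luma rows, so each U/V row is fed to the row function twice and the chroma pointers advance only after odd rows. Schematically (RowFunc is a placeholder name):

    for (int y = 0; y < height; ++y) {
      RowFunc(src_y, src_u, src_v, dst, width);  // chroma row reused twice
      src_y += src_stride_y;
      dst += dst_stride;
      if (y & 1) {  // move to the next chroma row every second luma row
        src_u += src_stride_u;
        src_v += src_stride_v;
      }
    }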
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565 with specified color matrix.
-LIBYUV_API
-int I420ToRGB565Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvI601Constants, width, height);
-}
-
-// Convert J420 to RGB565.
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert H420 to RGB565.
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
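Note: I420ToRGB565Dither converts each row to ARGB in a temporary buffer, then truncates to 565 while adding a small ordered-dither bias per pixel. The expression *(const uint32_t*)(dither4x4 + ((y & 3) << 2)) packs one 4-byte row of the 4x4 matrix into a uint32 that the row function cycles across x. A scalar reference for the dithered pack, assuming little-endian 565 storage (a sketch, not the library's row implementation):

    static void ToRGB565DitherRef(const uint8_t* argb, uint8_t* rgb565,
                                  uint32_t dither4, int width) {
      for (int x = 0; x < width; ++x) {
        int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // bias 0..7
        int b = argb[4 * x + 0] + d;
        int g = argb[4 * x + 1] + d;
        int r = argb[4 * x + 2] + d;
        if (b > 255) b = 255;
        if (g > 255) g = 255;
        if (r > 255) r = 255;
        uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
        rgb565[2 * x + 0] = (uint8_t)(p & 0xff);
        rgb565[2 * x + 1] = (uint8_t)(p >> 8);
      }
    }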
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I420 to specified format
-LIBYUV_API
-int ConvertFromI420(const uint8_t* y,
- int y_stride,
- const uint8_t* u,
- int u_stride,
- const uint8_t* v,
- int v_stride,
- uint8_t* dst_sample,
- int dst_sample_stride,
- int width,
- int height,
- uint32_t fourcc) {
- uint32_t format = CanonicalFourCC(fourcc);
- int r = 0;
- if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
- return -1;
- }
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_UYVY:
- r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_RGBP:
- r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_RGBO:
- r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_R444:
- r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_24BG:
- r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3, width,
- height);
- break;
- case FOURCC_RAW:
- r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3, width,
- height);
- break;
- case FOURCC_ARGB:
- r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_BGRA:
- r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_ABGR:
- r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_RGBA:
- r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_AR30:
- r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_I400:
- r = I400Copy(y, y_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- case FOURCC_NV12: {
- uint8_t* dst_uv = dst_sample + width * height;
- r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, dst_uv,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- }
- case FOURCC_NV21: {
- uint8_t* dst_vu = dst_sample + width * height;
- r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, dst_vu,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- }
- // TODO(fbarchard): Add M420.
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YV12: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- int halfstride = (dst_sample_stride + 1) / 2;
- int halfheight = (height + 1) / 2;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV12) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + halfstride * halfheight;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + halfstride * halfheight;
- }
- r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
- width, height);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- int halfstride = (dst_sample_stride + 1) / 2;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV16) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + halfstride * height;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + halfstride * height;
- }
- r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
- width, height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV24) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + dst_sample_stride * height;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + dst_sample_stride * height;
- }
- r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, dst_sample_stride, dst_v,
- dst_sample_stride, width, height);
- break;
- }
- // Formats not supported: MJPG, biplanar, and some RGB formats.
- default:
- return -1; // unknown fourcc - return failure code.
- }
- return r;
-}
-
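Note: ConvertFromI420 is the generic exit point: it canonicalizes the fourcc, dispatches to the matching I420To* routine, and substitutes a packed default stride whenever dst_sample_stride is 0. A hedged usage sketch (assumes the caller sized the output at width * height * 4 bytes for ARGB):

    int I420FrameToARGB(const uint8_t* y, int y_stride,
                        const uint8_t* u, int u_stride,
                        const uint8_t* v, int v_stride,
                        uint8_t* out, int width, int height) {
      // Stride 0 selects the packed default, width * 4 for ARGB.
      return ConvertFromI420(y, y_stride, u, u_stride, v, v_stride, out,
                             0, width, height, FOURCC_ARGB);
    }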
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
deleted file mode 100644
index d414186a..00000000
--- a/files/source/rotate.cc
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/convert.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-LIBYUV_API
-void TransposePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int i = height;
-#if defined(HAS_TRANSPOSEWX16_MSA)
- void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
- int dst_stride, int width) = TransposeWx16_C;
-#else
- void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
- int dst_stride, int width) = TransposeWx8_C;
-#endif
-#if defined(HAS_TRANSPOSEWX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeWx8 = TransposeWx8_NEON;
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- TransposeWx8 = TransposeWx8_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- TransposeWx8 = TransposeWx8_SSSE3;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeWx8 = TransposeWx8_MMI;
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx8 = TransposeWx8_Fast_SSSE3;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeWx16 = TransposeWx16_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx16 = TransposeWx16_MSA;
- }
- }
-#endif
-
-#if defined(HAS_TRANSPOSEWX16_MSA)
- // Work across the source in 16x16 tiles
- while (i >= 16) {
- TransposeWx16(src, src_stride, dst, dst_stride, width);
- src += 16 * src_stride; // Go down 16 rows.
- dst += 16; // Move over 16 columns.
- i -= 16;
- }
-#else
- // Work across the source in 8x8 tiles
- while (i >= 8) {
- TransposeWx8(src, src_stride, dst, dst_stride, width);
- src += 8 * src_stride; // Go down 8 rows.
- dst += 8; // Move over 8 columns.
- i -= 8;
- }
-#endif
-
- if (i > 0) {
- TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
- }
-}
-
-LIBYUV_API
-void RotatePlane90(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Rotate by 90 is a transpose with the source read
- // from bottom to top. So set the source pointer to the end
- // of the buffer and flip the sign of the source stride.
- src += src_stride * (height - 1);
- src_stride = -src_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane270(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Rotate by 270 is a transpose with the destination written
- // from bottom to top. So set the destination pointer to the end
- // of the buffer and flip the sign of the destination stride.
- dst += dst_stride * (width - 1);
- dst_stride = -dst_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
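Note: both rotations reuse one tiled transpose kernel: 90 degrees reads the source bottom-up before transposing, 270 writes the destination bottom-up after it. In index terms, a 90-degree clockwise rotation of a width x height plane satisfies dst[r][c] = src[height - 1 - c][r]. A scalar reference for intuition (the tiled code computes the same mapping):

    static void Rotate90Ref(const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
      // dst is height wide and width tall.
      for (int r = 0; r < width; ++r) {
        for (int c = 0; c < height; ++c) {
          dst[r * dst_stride + c] = src[(height - 1 - c) * src_stride + r];
        }
      }
    }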
-LIBYUV_API
-void RotatePlane180(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Swap first and last row and mirror the content. Uses a temporary row.
- align_buffer_64(row, width);
- const uint8_t* src_bot = src + src_stride * (height - 1);
- uint8_t* dst_bot = dst + dst_stride * (height - 1);
- int half_height = (height + 1) >> 1;
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
- }
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
- }
-#endif
-
- // Odd height will harmlessly mirror the middle row twice.
- for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
- MirrorRow(src_bot, dst, width); // Mirror last row into first row
- dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
- src_bot -= src_stride;
- dst_bot -= dst_stride;
- }
- free_aligned_buffer_64(row);
-}
-
-LIBYUV_API
-void TransposeUV(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i = height;
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
- int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
- int width) = TransposeUVWx16_C;
-#else
- void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
- int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
- int width) = TransposeUVWx8_C;
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeUVWx8 = TransposeUVWx8_NEON;
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx8 = TransposeUVWx8_SSE2;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeUVWx8 = TransposeUVWx8_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- TransposeUVWx8 = TransposeUVWx8_MMI;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
- }
- }
-#endif
-
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- // Work through the source in 16x16 tiles.
- while (i >= 16) {
- TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width);
- src += 16 * src_stride; // Go down 16 rows.
- dst_a += 16; // Move over 16 columns.
- dst_b += 16; // Move over 16 columns.
- i -= 16;
- }
-#else
- // Work through the source in 8x8 tiles.
- while (i >= 8) {
- TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width);
- src += 8 * src_stride; // Go down 8 rows.
- dst_a += 8; // Move over 8 columns.
- dst_b += 8; // Move over 8 columns.
- i -= 8;
- }
-#endif
-
- if (i > 0) {
- TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width, i);
- }
-}
-
-LIBYUV_API
-void RotateUV90(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- src += src_stride * (height - 1);
- src_stride = -src_stride;
-
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
-}
-
-LIBYUV_API
-void RotateUV270(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- dst_a += dst_stride_a * (width - 1);
- dst_b += dst_stride_b * (width - 1);
- dst_stride_a = -dst_stride_a;
- dst_stride_b = -dst_stride_b;
-
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
-}
-
-// Rotate 180 is a horizontal and vertical flip.
-LIBYUV_API
-void RotateUV180(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_MMI;
- }
-#endif
-
- dst_a += dst_stride_a * (height - 1);
- dst_b += dst_stride_b * (height - 1);
-
- for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
- src += src_stride;
- dst_a -= dst_stride_a;
- dst_b -= dst_stride_b;
- }
-}
-
-LIBYUV_API
-int RotatePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height,
- enum RotationMode mode) {
- if (!src || width <= 0 || height == 0 || !dst) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src = src + (height - 1) * src_stride;
- src_stride = -src_stride;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- CopyPlane(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate90:
- RotatePlane90(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate270:
- RotatePlane270(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate180:
- RotatePlane180(src, src_stride, dst, dst_stride, width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int I420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
- !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int I444Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum libyuv::RotationMode mode) {
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
- !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- switch (mode) {
- case libyuv::kRotate0:
- // copy frame
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int NV12ToI420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
- !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_uv = src_uv + (halfheight - 1) * src_stride_uv;
- src_stride_y = -src_stride_y;
- src_stride_uv = -src_stride_uv;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
- width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
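RotatePlane, I420Rotate, I444Rotate and NV12ToI420Rotate above share one calling convention: a negative height requests vertical inversion of the source, kRotate0 degenerates to a copy, and a 90- or 270-degree rotation swaps the output dimensions. A minimal caller sketch, assuming the caller sized the buffers (the names here are illustrative):

    #include <stdint.h>
    #include "libyuv/rotate.h"

    // Hypothetical caller: rotate one plane by 90 degrees. After rotation
    // the destination is height x width, so its stride is the source height.
    int Rotate90Example(const uint8_t* src, int width, int height,
                        uint8_t* dst /* must hold height * width bytes */) {
      return libyuv::RotatePlane(src, width,   // src, src_stride
                                 dst, height,  // dst, dst_stride (swapped)
                                 width, height, libyuv::kRotate90);
    }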
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
deleted file mode 100644
index ff212ade..00000000
--- a/files/source/rotate_common.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-void TransposeWx8_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst[0] = src[0 * src_stride];
- dst[1] = src[1 * src_stride];
- dst[2] = src[2 * src_stride];
- dst[3] = src[3 * src_stride];
- dst[4] = src[4 * src_stride];
- dst[5] = src[5 * src_stride];
- dst[6] = src[6 * src_stride];
- dst[7] = src[7 * src_stride];
- ++src;
- dst += dst_stride;
- }
-}
-
-void TransposeUVWx8_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst_a[0] = src[0 * src_stride + 0];
- dst_b[0] = src[0 * src_stride + 1];
- dst_a[1] = src[1 * src_stride + 0];
- dst_b[1] = src[1 * src_stride + 1];
- dst_a[2] = src[2 * src_stride + 0];
- dst_b[2] = src[2 * src_stride + 1];
- dst_a[3] = src[3 * src_stride + 0];
- dst_b[3] = src[3 * src_stride + 1];
- dst_a[4] = src[4 * src_stride + 0];
- dst_b[4] = src[4 * src_stride + 1];
- dst_a[5] = src[5 * src_stride + 0];
- dst_b[5] = src[5 * src_stride + 1];
- dst_a[6] = src[6 * src_stride + 0];
- dst_b[6] = src[6 * src_stride + 1];
- dst_a[7] = src[7 * src_stride + 0];
- dst_b[7] = src[7 * src_stride + 1];
- src += 2;
- dst_a += dst_stride_a;
- dst_b += dst_stride_b;
- }
-}
-
-void TransposeWxH_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int i;
- for (i = 0; i < width; ++i) {
- int j;
- for (j = 0; j < height; ++j) {
- dst[i * dst_stride + j] = src[j * src_stride + i];
- }
- }
-}
-
-void TransposeUVWxH_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i;
- for (i = 0; i < width * 2; i += 2) {
- int j;
- for (j = 0; j < height; ++j) {
- dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
- dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
- }
- }
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
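TransposeWxH_C above states the transpose contract in its simplest form: dst[i * dst_stride + j] = src[j * src_stride + i]. A small self-checking harness against that reference (sizes arbitrary; assumes the declaration from libyuv/rotate_row.h):

    #include <assert.h>
    #include <stdint.h>
    #include "libyuv/rotate_row.h"

    // Hypothetical harness: fill a 3x5 plane, transpose with the reference
    // C kernel, and verify dst[i][j] == src[j][i].
    void CheckTransposeWxH(void) {
      enum { kW = 5, kH = 3 };
      uint8_t src[kH][kW], dst[kW][kH];
      for (int j = 0; j < kH; ++j)
        for (int i = 0; i < kW; ++i)
          src[j][i] = (uint8_t)(j * kW + i);
      TransposeWxH_C(&src[0][0], kW, &dst[0][0], kH, kW, kH);
      for (int i = 0; i < kW; ++i)
        for (int j = 0; j < kH; ++j)
          assert(dst[i][j] == src[j][i]);
    }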
diff --git a/files/source/rotate_dspr2.cc b/files/source/rotate_dspr2.cc
deleted file mode 100644
index 5d2338de..00000000
--- a/files/source/rotate_dspr2.cc
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "sw $s0, 0(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "sw $s1, 4(%[dst]) \n"
- "bnez %[width], 1b \n"
- " addu %[dst], %[dst], %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "swr $s0, 0(%[dst]) \n"
- "swl $s0, 3(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "swr $s1, 4(%[dst]) \n"
- "swl $s1, 7(%[dst]) \n"
- "bnez %[width], 11b \n"
- "addu %[dst], %[dst], %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set noat \n"
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
-
- "srl $AT, %[width], 0x2 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "sw $s4, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $s6, 0($s0) \n"
- "sw $t8, 4($s0) \n"
- "sw $s5, 0($s1) \n"
- "sw $t1, 4($s1) \n"
- "sw $s7, 0($s2) \n"
- "sw $t9, 4($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 1b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "swr $s4, 0(%[dst]) \n"
- "swl $s4, 3(%[dst]) \n"
- "swr $t0, 4(%[dst]) \n"
- "swl $t0, 7(%[dst]) \n"
- "swr $s6, 0($s0) \n"
- "swl $s6, 3($s0) \n"
- "swr $t8, 4($s0) \n"
- "swl $t8, 7($s0) \n"
- "swr $s5, 0($s1) \n"
- "swl $s5, 3($s1) \n"
- "swr $t1, 4($s1) \n"
- "swl $t1, 7($s1) \n"
- "swr $s7, 0($s2) \n"
- "swl $s7, 3($s2) \n"
- "swr $t9, 4($s2) \n"
- "swl $t9, 7($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 11b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- ".set at \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6", "s7");
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst_a,
- int dst_stride_a,
- uint8* dst_b,
- int dst_stride_b,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "subu $t7, $t9, %[src_stride] \n"
- "srl $t1, %[width], 1 \n"
-
-      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
- "andi $t0, %[dst_a], 0x3 \n"
- "andi $t8, %[dst_b], 0x3 \n"
- "or $t0, $t0, $t8 \n"
- "andi $t8, %[dst_stride_a], 0x3 \n"
- "andi $s5, %[dst_stride_b], 0x3 \n"
- "or $t8, $t8, $s5 \n"
- "or $t0, $t0, $t8 \n"
- "bnez $t0, 11f \n"
- " nop \n"
- // dst + dst_stride word aligned (both, a & b dst addresses)
- "1: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "sw $s3, 0($s5) \n"
- "sw $s4, 0($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "sw $s3, 0(%[dst_a]) \n"
- "sw $s4, 0(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
-      "precrq.ph.w $s2, $s0, $t9              \n"  // |B7|A7|B6|A6|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
- "sw $s3, 4($s5) \n"
- "sw $s4, 4($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "sw $s3, 4(%[dst_a]) \n"
- "sw $s4, 4(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 1b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
- "b 2f \n"
- " nop \n"
-
- // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
- "11: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "swr $s3, 0($s5) \n"
- "swl $s3, 3($s5) \n"
- "swr $s4, 0($s6) \n"
- "swl $s4, 3($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "swr $s3, 0(%[dst_a]) \n"
- "swl $s3, 3(%[dst_a]) \n"
- "swr $s4, 0(%[dst_b]) \n"
- "swl $s4, 3(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
-      "precrq.ph.w $s2, $s0, $t9              \n"  // |B7|A7|B6|A6|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
-
- "swr $s3, 4($s5) \n"
- "swl $s3, 7($s5) \n"
- "swr $s4, 4($s6) \n"
- "swl $s4, 7($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "swr $s3, 4(%[dst_a]) \n"
- "swl $s3, 7(%[dst_a]) \n"
- "swr $s4, 4(%[dst_b]) \n"
- "swl $s4, 7(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 11b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
-
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
- [width] "+r"(width), [src_stride] "+r"(src_stride)
- : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6");
-}
-
-#endif  // !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) &&
-        // (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
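Each DSPR2 kernel above branches once on word alignment of the destination and its stride, then uses plain sw stores on the aligned path and swr/swl pairs on the unaligned one. In portable C the same split reduces to this sketch (illustrative, not part of the deleted file):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical C analogue of the sw vs. swr/swl split: a direct word
    // store when the address is 4-byte aligned, a byte-wise store otherwise.
    static void StoreWord(uint8_t* dst, uint32_t v) {
      if (((uintptr_t)dst & 3) == 0) {
        *(uint32_t*)(void*)dst = v;  // aligned path (sw)
      } else {
        memcpy(dst, &v, 4);          // unaligned path (swr + swl)
      }
    }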
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
deleted file mode 100644
index 04e19e29..00000000
--- a/files/source/rotate_gcc.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Transpose 8x8. 32 or 64 bit, but not NaCl for 64 bit.
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
-
-// Transpose 16x8. 64 bit
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
- "xmm15");
-}
-#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-
-// Transpose UV 8x8. 64 bit.
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(src_stride)), // %4
- "r"((intptr_t)(dst_stride_a)), // %5
- "r"((intptr_t)(dst_stride_b)) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7", "xmm8", "xmm9");
-}
-#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
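The SSSE3/SSE2 kernels above each transpose one strip of 8 source rows per call; the caller in rotate.cc walks the plane strip by strip and finishes leftover rows with the C reference. A hedged sketch of that caller shape, written against the C kernels from rotate_common.cc:

    #include <stdint.h>
    #include "libyuv/rotate_row.h"

    // Sketch: consume the plane 8 rows at a time with the Wx8 kernel (the
    // real code substitutes TransposeWx8_SSSE3 etc. after CPU detection),
    // then hand any remaining rows to the generic WxH reference.
    void TransposePlaneSketch(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
      int i = height;
      while (i >= 8) {
        TransposeWx8_C(src, src_stride, dst, dst_stride, width);
        src += 8 * src_stride;  // next 8-row strip of the source
        dst += 8;               // transposed strip starts 8 columns over
        i -= 8;
      }
      if (i > 0) {
        TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
      }
    }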
diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc
deleted file mode 100644
index f8de6083..00000000
--- a/files/source/rotate_mmi.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI (Loongson3A).
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-void TransposeWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (00 10 01 11 02 12 03 13) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (04 14 05 15 06 16 07 17) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (20 30 21 31 22 32 23 33) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (24 34 25 35 26 36 27 37) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (00 10 20 30 01 11 21 31) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (02 12 22 32 03 13 23 33) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (04 14 24 34 05 15 25 35) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (06 16 26 36 07 17 27 37) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (40 50 41 51 42 52 43 53) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (44 54 45 55 46 56 47 57) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (60 70 61 71 62 72 63 73) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (64 74 65 75 66 76 67 77) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (40 50 60 70 41 51 61 71) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (42 52 62 72 43 53 63 73) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (44 54 64 74 45 55 65 75) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (46 56 66 76 47 57 67 77) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (00 10 20 30 40 50 60 70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (01 11 21 31 41 51 61 71) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (02 12 22 32 42 52 62 72) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (03 13 23 33 43 53 63 73) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (04 14 24 34 44 54 64 74) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (05 15 25 35 45 55 65 75) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (06 16 26 36 46 56 66 76) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (07 17 27 37 47 57 67 77) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "daddi %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
- [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
- [dst_stride] "r"(dst_stride)
- : "memory");
-}
-
-void TransposeUVWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
-      "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
-      /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
-      "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
-      "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
-      /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
-      "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "daddiu %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
- [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
- [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
- : "memory");
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
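The MMI transpose is built entirely from punpck interleaves on 64-bit lanes: a byte round, a halfword round, and a word round turn 8 row fragments into 8 column fragments. As a reference point, punpcklbh on little-endian lanes behaves like this C model (a sketch, not a Loongson intrinsic):

    #include <stdint.h>

    // Hypothetical C model of punpcklbh: interleave the low four bytes of
    // a and b, yielding a0 b0 a1 b1 a2 b2 a3 b3 in memory order.
    static uint64_t PunpcklbhSketch(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 4; ++i) {
        r |= ((a >> (8 * i)) & 0xff) << (16 * i);
        r |= ((b >> (8 * i)) & 0xff) << (16 * i + 8);
      }
      return r;
    }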
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
deleted file mode 100644
index 06ca723a..00000000
--- a/files/source/row_any.cc
+++ /dev/null
@@ -1,1429 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <string.h> // For memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The memset of temp clears the staged source data (not the destination) so
-// that SIMD kernels reading a full multiple of 16 bytes do not trigger msan
-// errors. The memset is not needed in production, as the garbage values are
-// processed but not used, though there may be edge cases for subsampling.
-// The buffer size is based on the largest read, which can be inferred from
-// the source type (e.g. ARGB) and the mask (the last parameter), or by
-// examining how far the source code advances the source pointers.
-
-// Subsampled source width needs to be rounded up by 1 if odd; e.g. SS(5, 1)
-// yields 3, the number of UV samples covering 5 Y pixels.
-#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
-
-// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 5]); \
- memset(temp, 0, 64 * 4); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 192, a_buf + n, r); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_NEON
-ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_MSA
-ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
-#endif
-#undef ANY41C
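Every ANY* wrapper in this file expands to the same shape: run the SIMD kernel over the largest multiple of its natural width, stage the remainder in a zeroed, padded temp buffer, run one more full SIMD block there, and copy back only the valid pixels. Expanded by hand for a 1:1 row function with a 16-pixel kernel (SimdRow is a hypothetical stand-in; SIMD_ALIGNED comes from libyuv/row.h):

    #include <string.h>
    #include "libyuv/row.h"

    void SimdRow(const uint8_t* src, uint8_t* dst, int width);  // hypothetical

    // Hand expansion of the ANY pattern for MASK = 15.
    void AnyRowSketch(const uint8_t* src, uint8_t* dst, int width) {
      SIMD_ALIGNED(uint8_t temp[128 * 2]);
      memset(temp, 0, 128);            // zero staged source for msan
      int r = width & 15;              // leftover pixels
      int n = width & ~15;             // largest multiple of 16
      if (n > 0) {
        SimdRow(src, dst, n);          // fast path over the bulk
      }
      memcpy(temp, src + n, r);        // stage the tail, zero padded
      SimdRow(temp, temp + 128, 16);   // one full block on the staging buffer
      memcpy(dst + n, temp + 128, r);  // write back only r valid pixels
    }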
-
-// Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-// Merge functions.
-#ifdef HAS_MERGERGBROW_SSSE3
-ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
-#endif
-#ifdef HAS_MERGERGBROW_NEON
-ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
-#endif
-#ifdef HAS_MERGERGBROW_MMI
-ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
-#endif
-#ifdef HAS_I422TOYUY2ROW_SSE2
-ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
-ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_AVX2
-ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
-ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOYUY2ROW_NEON
-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_MSA
-ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOYUY2ROW_MMI
-ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
-#endif
-#ifdef HAS_I422TOUYVYROW_NEON
-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOUYVYROW_MSA
-ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOUYVYROW_MMI
-ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
-#endif
-#ifdef HAS_BLENDPLANEROW_AVX2
-ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
-#endif
-#ifdef HAS_BLENDPLANEROW_SSSE3
-ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
-#endif
-#ifdef HAS_BLENDPLANEROW_MMI
-ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
-#endif
-#undef ANY31
-
-// Note that odd-width replication below applies to 444 formats as well,
-// because the ARM implementation subsamples 444 to 422 internally: for an
-// odd width, the last U and V samples are duplicated so the extra Y pixel
-// still has a chroma pair.
-// Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 4]); \
- memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- if (width & 1) { \
- temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
- temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
- } \
- ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
- MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422TOAR30ROW_SSSE3
-ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422TOAR30ROW_AVX2
-ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I444TOARGBROW_SSSE3
-ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
-#endif // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
-#endif
-#ifdef HAS_I422TOARGBROW_AVX2
-ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422TORGBAROW_AVX2
-ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I444TOARGBROW_AVX2
-ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
-#endif
-#ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TOARGBROW_NEON
-ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
-ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TOARGBROW_MSA
-ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
-ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
-#endif
-#undef ANY31C
-
-// Any 3 planes of 16 bit to 1 with yuvconstants
-// TODO(fbarchard): consider sharing this code with ANY31C
-#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
- void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
- uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(T temp[16 * 3]); \
- SIMD_ALIGNED(uint8_t out[64]); \
- memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r * SBPP); \
- memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I210TOAR30ROW_SSSE3
-ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
-#endif
-#ifdef HAS_I210TOARGBROW_SSSE3
-ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
-#endif
-#ifdef HAS_I210TOARGBROW_AVX2
-ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
-#endif
-#ifdef HAS_I210TOAR30ROW_AVX2
-ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
-#endif
-#undef ANY31CT
-
-// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-
-// Merge functions.
-#ifdef HAS_MERGEUVROW_SSE2
-ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
-#endif
-#ifdef HAS_MERGEUVROW_NEON
-ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_MSA
-ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_MMI
-ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
-#endif
-#ifdef HAS_NV21TOYUV24ROW_NEON
-ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV21TOYUV24ROW_AVX2
-ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-// Math functions.
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBADDROW_SSE2
-ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_AVX2
-ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_NEON
-ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_MSA
-ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_MMI
-ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBADDROW_MSA
-ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_MMI
-ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_MSA
-ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_MMI
-ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_SOBELROW_SSE2
-ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELROW_NEON
-ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELROW_MSA
-ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELROW_MMI
-ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_NEON
-ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_MSA
-ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_MMI
-ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
-#endif
-#ifdef HAS_SOBELXYROW_SSE2
-ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELXYROW_NEON
-ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELXYROW_MSA
-ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELXYROW_MMI
-ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
-#endif
-#undef ANY21
-
-// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
- }
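
// A hedged note on the staging arithmetic above: SS(width, shift) is the
// round-up shift assumed to be defined earlier in this file; the hypothetical
// SS_SKETCH below restates it for illustration only.
#include <assert.h>
#define SS_SKETCH(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
static void ss_sketch_example(void) {
  // With UVSHIFT = 1 (one UV sample per 2 pixels) and r = 5 leftover pixels,
  // SS(5, 1) = 3 UV samples, so 3 * SBPP2 bytes are staged at temp + 128
  // alongside the 5 Y bytes staged at temp.
  assert(SS_SKETCH(5, 1) == 3);
}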
-
-// Biplanar to RGB.
-#ifdef HAS_NV12TOARGBROW_SSSE3
-ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_AVX2
-ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV12TOARGBROW_NEON
-ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_MSA
-ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_SSSE3
-ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_AVX2
-ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV21TOARGBROW_NEON
-ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_MSA
-ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TORGB24ROW_NEON
-ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
-#endif
-#ifdef HAS_NV21TORGB24ROW_NEON
-ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
-#endif
-#ifdef HAS_NV12TORGB24ROW_SSSE3
-ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV21TORGB24ROW_SSSE3
-ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV12TORGB24ROW_AVX2
-ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-#ifdef HAS_NV21TORGB24ROW_AVX2
-ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_AVX2
-ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-#endif
-#ifdef HAS_NV12TORGB565ROW_NEON
-ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_MSA
-ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
-#endif
-#undef ANY21C
-
-// Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
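
// A minimal sketch of the width split every ANY wrapper here performs. MASK
// is (SIMD step - 1): n is the largest multiple of the step, handled in
// place, and the r leftover pixels are padded out to one full step in temp.
#include <assert.h>
static void any_width_split_example(void) {
  int width = 100;
  int mask = 31;          // e.g. an AVX2 kernel stepping 32 pixels at a time
  int n = width & ~mask;  // 96 pixels go straight to the SIMD kernel
  int r = width & mask;   // 4 pixels run through the aligned temp buffer
  assert(n == 96 && r == 4);
}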
-
-#ifdef HAS_COPYROW_AVX
-ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
-#endif
-#ifdef HAS_COPYROW_SSE2
-ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
-#endif
-#ifdef HAS_COPYROW_NEON
-ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_AVX2)
-ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
-ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORAWROW_AVX2)
-ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
-ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_ABGRTOAR30ROW_SSSE3)
-ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
-#endif
-#if defined(HAS_ARGBTOAR30ROW_SSSE3)
-ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
-#endif
-#if defined(HAS_ABGRTOAR30ROW_AVX2)
-ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
-#endif
-#if defined(HAS_ARGBTOAR30ROW_AVX2)
-ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_SSE2)
-ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
-ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
-ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
-ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
-ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_MSA)
-ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
-ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
-ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
-ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
-ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
-ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
-ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
-ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
-ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_MSA)
-ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
-#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
-ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
-#endif
-#ifdef HAS_ARGBTOYROW_AVX2
-ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYJROW_AVX2
-ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_UYVYTOYROW_AVX2
-ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
-#endif
-#ifdef HAS_YUY2TOYROW_AVX2
-ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYROW_SSSE3
-ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_BGRATOYROW_SSSE3
-ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
-ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYJROW_SSSE3
-ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYROW_NEON
-ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYROW_MSA
-ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYROW_MMI
-ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYJROW_NEON
-ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYJROW_MSA
-ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYJROW_MMI
-ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_BGRATOYROW_NEON
-ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_BGRATOYROW_MSA
-ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_BGRATOYROW_MMI
-ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_NEON
-ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_MSA
-ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_MMI
-ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_NEON
-ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_MSA
-ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_RGBATOYROW_MMI
-ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGB24TOYROW_NEON
-ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RGB24TOYROW_MSA
-ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
-#endif
-#ifdef HAS_RGB24TOYROW_MMI
-ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RAWTOYROW_NEON
-ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RAWTOYROW_MSA
-ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
-#endif
-#ifdef HAS_RAWTOYROW_MMI
-ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RGB565TOYROW_NEON
-ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_RGB565TOYROW_MSA
-ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
-#endif
-#ifdef HAS_RGB565TOYROW_MMI
-ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB1555TOYROW_NEON
-ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB1555TOYROW_MSA
-ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
-#endif
-#ifdef HAS_ARGB1555TOYROW_MMI
-ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB4444TOYROW_NEON
-ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB4444TOYROW_MMI
-ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOYROW_MSA
-ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOYROW_MMI
-ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
-#endif
-#ifdef HAS_UYVYTOYROW_MSA
-ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_UYVYTOYROW_MMI
-ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
-#endif
-#ifdef HAS_AYUVTOYROW_NEON
-ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
-#endif
-#ifdef HAS_AYUVTOYROW_NEON
-ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
-#endif
-#ifdef HAS_RGB24TOARGBROW_NEON
-ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RGB24TOARGBROW_MSA
-ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
-#endif
-#ifdef HAS_RGB24TOARGBROW_MMI
-ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
-#endif
-#ifdef HAS_RAWTOARGBROW_NEON
-ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RAWTOARGBROW_MSA
-ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
-#endif
-#ifdef HAS_RAWTOARGBROW_MMI
-ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
-#endif
-#ifdef HAS_RGB565TOARGBROW_NEON
-ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_RGB565TOARGBROW_MSA
-ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_RGB565TOARGBROW_MMI
-ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_MSA
-ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_MMI
-ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_MSA
-ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_MMI
-ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_NEON
-ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_MSA
-ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_MMI
-ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
-ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
-ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
-ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
-#endif
-#undef ANY11
-
-// Any 1 to 1 blended. Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 64, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
- }
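
// A scalar sketch of why ANY11B stages the destination as well: kernels such
// as ARGBCopyAlphaRow rewrite only part of each destination pixel, so the
// untouched bytes must survive the round trip through temp. The _Sketch
// helper below is hypothetical, for illustration only.
#include <stdint.h>
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // alpha only; BGR kept as-is
  }
}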
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYALPHAROW_MMI
-ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
-ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
-#endif
-#undef ANY11B
-
-// Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp, temp + 64, param, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
- }
-
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
- ARGBToRGB565DitherRow_SSE2,
- const uint32_t,
- 4,
- 2,
- 3)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
- ARGBToRGB565DitherRow_AVX2,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON,
- ARGBToRGB565DitherRow_NEON,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
-ANY11P(ARGBToRGB565DitherRow_Any_MSA,
- ARGBToRGB565DitherRow_MSA,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
-ANY11P(ARGBToRGB565DitherRow_Any_MMI,
- ARGBToRGB565DitherRow_MMI,
- const uint32_t,
- 4,
- 2,
- 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_MSA
-ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_MMI
-ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
-#endif
-#undef ANY11P
-
-// Any 1 to 1 with scale parameter, converting between 8- and 16-bit planes.
-// SBPP and BPP are measured in bytes per element.
-#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
- void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
- SIMD_ALIGNED(STYPE temp[32]); \
- SIMD_ALIGNED(DTYPE out[32]); \
- memset(temp, 0, 32 * SBPP); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, scale, n); \
- } \
- memcpy(temp, src_ptr + n, r * SBPP); \
- ANY_SIMD(temp, out, scale, MASK + 1); \
- memcpy(dst_ptr + n, out, r * BPP); \
- }
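
// A hedged scalar sketch of the kernels this wrapper feeds: Convert16To8
// applies a fixed-point scale with 16 fractional bits and clamps to a byte,
// so e.g. scale = 16384 maps 10-bit 0..1023 onto 0..255. The _Sketch name is
// hypothetical.
#include <stdint.h>
static uint8_t Convert16To8_Sketch(uint16_t v, int scale) {
  int out = (v * scale) >> 16;              // fixed-point downscale
  return (uint8_t)(out > 255 ? 255 : out);  // clamp to byte range
}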
-
-#ifdef HAS_CONVERT16TO8ROW_SSSE3
-ANY11C(Convert16To8Row_Any_SSSE3,
- Convert16To8Row_SSSE3,
- 2,
- 1,
- uint16_t,
- uint8_t,
- 15)
-#endif
-#ifdef HAS_CONVERT16TO8ROW_AVX2
-ANY11C(Convert16To8Row_Any_AVX2,
- Convert16To8Row_AVX2,
- 2,
- 1,
- uint16_t,
- uint8_t,
- 31)
-#endif
-#ifdef HAS_CONVERT8TO16ROW_SSE2
-ANY11C(Convert8To16Row_Any_SSE2,
- Convert8To16Row_SSE2,
- 1,
- 2,
- uint8_t,
- uint16_t,
- 15)
-#endif
-#ifdef HAS_CONVERT8TO16ROW_AVX2
-ANY11C(Convert8To16Row_Any_AVX2,
- Convert8To16Row_AVX2,
- 1,
- 2,
- uint8_t,
- uint16_t,
- 31)
-#endif
-#undef ANY11C
-
-// Any 1 to 1 with a float parameter, e.g. half-float and byte-to-float rows.
-#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
- void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
- SIMD_ALIGNED(ST temp[32]); \
- SIMD_ALIGNED(T out[32]); \
- memset(temp, 0, SBPP * 32); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
- } \
- memcpy(temp, src_ptr + n, r * SBPP); \
- ANY_SIMD(temp, out, param, MASK + 1); \
- memcpy(dst_ptr + n, out, r * BPP); \
- }
-
-#ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
-#endif
-#ifdef HAS_HALFFLOATROW_AVX2
-ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
-#endif
-#ifdef HAS_HALFFLOATROW_F16C
-ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_F16C,
- HalfFloat1Row_F16C,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 15)
-#endif
-#ifdef HAS_HALFFLOATROW_NEON
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON,
- HalfFloat1Row_NEON,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 7)
-#endif
-#ifdef HAS_HALFFLOATROW_MSA
-ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
-#endif
-#ifdef HAS_BYTETOFLOATROW_NEON
-ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
-#endif
-#undef ANY11P16
-
-// Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
-ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
-ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
-ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
-#endif
-#if defined(HAS_YUY2TOARGBROW_MSA)
-ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
-#endif
-#undef ANY11C
-
-// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
- ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
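
// A hedged scalar sketch of the two-row blend these wrappers drive:
// source_y_fraction in [0, 256] weights the second row, mirroring the
// rounded shift the SIMD kernels use. Hypothetical helper for illustration.
#include <stdint.h>
static uint8_t InterpolatePixel_Sketch(uint8_t row0, uint8_t row1,
                                       int source_y_fraction) {
  int f1 = source_y_fraction;  // weight of the second source row
  int f0 = 256 - f1;           // weight of the first source row
  return (uint8_t)((row0 * f0 + row1 * f1 + 128) >> 8);  // rounded blend
}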
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
-#endif
-#ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_NEON
-ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_MSA
-ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
-#endif
-#ifdef HAS_INTERPOLATEROW_MMI
-ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
-#endif
-#undef ANY11T
-
-// Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr, r * BPP); \
- ANY_SIMD(temp, temp + 64, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
- }
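
// A worked example of the mirror tail above: the SIMD call mirrors input
// pixels r..width-1 into output pixels 0..n-1, then the leading r pixels are
// mirrored as one full step through temp, so the r wanted results sit
// (MASK + 1 - r) * BPP bytes into the output half of temp.
#include <assert.h>
static void mirror_tail_example(void) {
  enum { MASK = 15, BPP = 1 };
  int width = 20;
  int r = width & MASK;             // 4 leading pixels left over
  int n = width & ~MASK;            // 16 pixels handled by the kernel
  int skip = (MASK + 1 - r) * BPP;  // 12 padded results to discard
  assert(n == 16 && r == 4 && skip == 12);
}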
-
-#ifdef HAS_MIRRORROW_AVX2
-ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
-#endif
-#ifdef HAS_MIRRORROW_SSSE3
-ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
-#endif
-#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
-#endif
-#ifdef HAS_MIRRORROW_MSA
-ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
-#endif
-#ifdef HAS_MIRRORROW_MMI
-ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
-#endif
-#ifdef HAS_ARGBMIRRORROW_AVX2
-ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
-#endif
-#ifdef HAS_ARGBMIRRORROW_SSE2
-ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
-#endif
-#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
-#endif
-#ifdef HAS_ARGBMIRRORROW_MSA
-ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
-#endif
-#ifdef HAS_ARGBMIRRORROW_MMI
-ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
-#endif
-#undef ANY11M
-
-// Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
- SIMD_ALIGNED(uint8_t temp[64]); \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, v32, n); \
- } \
- ANY_SIMD(temp, v32, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp, r * BPP); \
- }
-
-#ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
-#endif
-#ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
-#endif
-#ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
-#endif
-#ifdef HAS_ARGBSETROW_MSA
-ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
-#endif
-#undef ANY1
-
-// Any 1 to 2. Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
- memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
- memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
- }
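
// A scalar sketch of a typical ANY12 kernel: YUY2 stores [Y0 U Y1 V] per
// 2 pixels (BPP = 4 with UVSHIFT = 1) and the UV422 output emits one U and
// one V per 2 pixels (DUVSHIFT = 1). Hypothetical helper for illustration.
#include <stdint.h>
static void YUY2ToUV422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_u,
                                  uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_u[x >> 1] = src_yuy2[x * 2 + 1];  // U shared by pixels x and x + 1
    dst_v[x >> 1] = src_yuy2[x * 2 + 3];  // V shared by pixels x and x + 1
  }
}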
-
-#ifdef HAS_SPLITUVROW_SSE2
-ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_AVX2
-ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
-#endif
-#ifdef HAS_SPLITUVROW_NEON
-ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_MSA
-ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
-#endif
-#ifdef HAS_SPLITUVROW_MMI
-ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
-#endif
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_AVX2
-ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
-ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_SSE2
-ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_MSA
-ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
-ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
-ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_MMI
-ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
-ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
-ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
-#endif
-#undef ANY12
-
-// Any 1 to 3. Outputs RGB planes.
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
- uint8_t* dst_b, int width) { \
- SIMD_ALIGNED(uint8_t temp[16 * 6]); \
- memset(temp, 0, 16 * 3); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
- } \
- memcpy(temp, src_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
- memcpy(dst_r + n, temp + 16 * 3, r); \
- memcpy(dst_g + n, temp + 16 * 4, r); \
- memcpy(dst_b + n, temp + 16 * 5, r); \
- }
-
-#ifdef HAS_SPLITRGBROW_SSSE3
-ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
-#endif
-#ifdef HAS_SPLITRGBROW_NEON
-ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
-#endif
-#ifdef HAS_SPLITRGBROW_MMI
-ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
-#endif
-#undef ANY13
-
-// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
-// A 128-byte row allows for 32 AVX ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
- uint8_t* dst_v, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 4]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
- SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
- memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
- BPP); \
- memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
- memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
- memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
- }
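
// A minimal sketch of the odd-width guard above: 2x2-subsampled UV kernels
// average horizontal pixel pairs, so an odd tail duplicates its last staged
// pixel to give the averaging a well-defined partner. r > 0 whenever width
// is odd, so the copy below never reads before the buffer.
#include <stdint.h>
#include <string.h>
static void repeat_last_pixel_example(uint8_t* temp, int r, int bpp) {
  memcpy(temp + r * bpp, temp + r * bpp - bpp, bpp);  // clone final pixel
}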
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_AVX2
-ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
-ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
-ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_AVX2
-ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
-ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
-ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
-ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_NEON
-ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_MSA
-ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_MMI
-ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_NEON
-ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_MSA
-ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_MMI
-ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_NEON
-ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_BGRATOUVROW_MMI
-ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_NEON
-ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ABGRTOUVROW_MMI
-ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_NEON
-ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_RGBATOUVROW_MMI
-ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_NEON
-ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_MSA
-ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_MMI
-ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_NEON
-ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_MSA
-ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_MMI
-ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_NEON
-ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_MSA
-ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_MMI
-ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_NEON
-ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_MSA
-ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_MMI
-ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_NEON
-ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_MMI
-ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_NEON
-ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_MSA
-ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
-#endif
-#ifdef HAS_YUY2TOUVROW_MMI
-ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_MSA
-ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
-#endif
-#ifdef HAS_UYVYTOUVROW_MMI
-ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
-#endif
-#undef ANY12S
-
-// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
-// A 128-byte row allows for 32 AVX ARGB pixels.
-#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
- SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
- memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
- BPP); \
- memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
- memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
- }
-
-#ifdef HAS_AYUVTOVUROW_NEON
-ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
-ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
-#endif
-#undef ANY11S
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc
deleted file mode 100644
index 11f78e0d..00000000
--- a/files/source/row_dspr2.cc
+++ /dev/null
@@ -1,1721 +0,0 @@
-/*
- * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on MIPS platforms:
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
- __asm__ __volatile__(
- ".set noreorder \n"
- ".set noat \n"
- "slti $at, %[count], 8 \n"
- "bne $at ,$zero, $last8 \n"
- "xor $t8, %[src], %[dst] \n"
- "andi $t8, $t8, 0x3 \n"
-
- "bne $t8, $zero, unaligned \n"
- "negu $a3, %[dst] \n"
- // make dst/src aligned
- "andi $a3, $a3, 0x3 \n"
- "beq $a3, $zero, $chk16w \n"
- // word-aligned; now count is the remaining byte count
- "subu %[count], %[count], $a3 \n"
-
- "lwr $t8, 0(%[src]) \n"
- "addu %[src], %[src], $a3 \n"
- "swr $t8, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
-
- // Now the dst/src are mutually word-aligned with word-aligned addresses
- "$chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, chk8w \n"
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n"
- // t0 is the "past the end" address
-
- // When the loop issues "pref 30, x(a1)", a1+x must not go past the
- // "t0-32" address. For x=128 the last "safe" a1 address is "t0-160";
- // for x=64 it is "t0-96". We use "pref 30, 128(a1)", so "t0-160" is
- // the limit.
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line of src
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $loop16w \n"
- "nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lw $t0, 0(%[src]) \n"
- "bgtz $v1, $skip_pref30_96 \n" // skip
- "lw $t1, 4(%[src]) \n"
- "pref 30, 96(%[dst]) \n" // continue
- "$skip_pref30_96: \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lw $t0, 32(%[src]) \n"
- "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
- "lw $t1, 36(%[src]) \n"
- "pref 30, 128(%[dst]) \n" // set dest, addr 128
- "$skip_pref30_128: \n"
- "lw $t2, 40(%[src]) \n"
- "lw $t3, 44(%[src]) \n"
- "lw $t4, 48(%[src]) \n"
- "lw $t5, 52(%[src]) \n"
- "lw $t6, 56(%[src]) \n"
- "lw $t7, 60(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
- "sltu $v1, $t9, %[dst] \n"
- "bne %[dst], $a3, $loop16w \n"
- " addiu %[src], %[src], 64 \n" // adding 64 to src
- "move %[count], $t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count past 32 bytes
- "beq %[count], $t8, chk1w \n"
- // count==t8, no 32-byte chunk
- " nop \n"
-
- "lw $t0, 0(%[src]) \n"
- "lw $t1, 4(%[src]) \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, $last8 \n"
- " subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
- // copying in words (4-byte chunks)
- "$wordCopy_loop: \n"
- "lw $t3, 0(%[src]) \n"
- // the first t3 may equal t0 ... optimize?
- "addiu %[src], %[src],4 \n"
- "addiu %[dst], %[dst],4 \n"
- "bne %[dst], $a3,$wordCopy_loop \n"
- " sw $t3, -4(%[dst]) \n"
-
- // For the last (<8) bytes
- "$last8: \n"
- "blez %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 -last dst address
- "$last8loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst], $a3, $last8loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "leave: \n"
- " j $ra \n"
- " nop \n"
-
- //
- // UNALIGNED case
- //
-
- "unaligned: \n"
- // got here with a3="negu a1"
- "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
- "beqz $a3, $ua_chk16w \n"
- " subu %[count], %[count], $a3 \n"
- // bytes left after initial a3 bytes
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
- "swr $v1, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
- // below the dst will be word aligned (NOTE1)
- "$ua_chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, ua_chk8w \n"
- // if a2==t8, no 64-byte chunks
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n" // t0 "past the end"
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line addr 32
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // safe, as we have at least 64 bytes ahead
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $ua_loop16w \n"
- // skip "pref 30,64(a1)" for too short arrays
- " nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$ua_loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "bgtz $v1, $ua_skip_pref30_96 \n"
- " lwl $t1, 7(%[src]) \n"
- "pref 30, 96(%[dst]) \n"
- // continue setting up the dest, addr 96
- "$ua_skip_pref30_96: \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lwr $t0, 32(%[src]) \n"
- "lwl $t0, 35(%[src]) \n"
- "lwr $t1, 36(%[src]) \n"
- "bgtz $v1, ua_skip_pref30_128 \n"
- " lwl $t1, 39(%[src]) \n"
- "pref 30, 128(%[dst]) \n"
- // continue setting up the dest, addr 128
- "ua_skip_pref30_128: \n"
-
- "lwr $t2, 40(%[src]) \n"
- "lwl $t2, 43(%[src]) \n"
- "lwr $t3, 44(%[src]) \n"
- "lwl $t3, 47(%[src]) \n"
- "lwr $t4, 48(%[src]) \n"
- "lwl $t4, 51(%[src]) \n"
- "lwr $t5, 52(%[src]) \n"
- "lwl $t5, 55(%[src]) \n"
- "lwr $t6, 56(%[src]) \n"
- "lwl $t6, 59(%[src]) \n"
- "lwr $t7, 60(%[src]) \n"
- "lwl $t7, 63(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst],%[dst],64 \n" // adding 64 to dest
- "sltu $v1,$t9,%[dst] \n"
- "bne %[dst],$a3,$ua_loop16w \n"
- " addiu %[src],%[src],64 \n" // adding 64 to src
- "move %[count],$t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "ua_chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count
- "beq %[count], $t8, $ua_chk1w \n"
- // when count==t8, no 32-byte chunk
-
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "lwl $t1, 7(%[src]) \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "$ua_chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, ua_smallCopy \n"
- "subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
-
- // copying in words (4-byte chunks)
- "$ua_wordCopy_loop: \n"
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addiu %[src], %[src], 4 \n"
- "addiu %[dst], %[dst], 4 \n"
- // note: dst=a1 is word aligned here, see NOTE1
- "bne %[dst], $a3, $ua_wordCopy_loop \n"
- " sw $v1,-4(%[dst]) \n"
-
- // Now less than 4 bytes (value in count) left to copy
- "ua_smallCopy: \n"
- "beqz %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 = last dst address
- "$ua_smallCopy_loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst],$a3,$ua_smallCopy_loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "j $ra \n"
- " nop \n"
- ".set at \n"
- ".set reorder \n"
- : [dst] "+r"(dst), [src] "+r"(src)
- : [count] "r"(count)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
- "at");
-}
-#endif // HAS_COPYROW_MIPS
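
// A hedged C sketch of the structure the assembly above implements: align
// the head, stream 64-byte chunks (the real code adds "pref" cache hints and
// lwr/lwl loads for unaligned sources), then drain the byte tail. The
// _Sketch name is hypothetical.
#include <stdint.h>
#include <string.h>
static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count > 0 && ((uintptr_t)dst & 3)) {  // byte head: word-align dst
    *dst++ = *src++;
    --count;
  }
  while (count >= 64) {  // unrolled 64-byte body
    memcpy(dst, src, 64);
    src += 64;
    dst += 64;
    count -= 64;
  }
  while (count-- > 0) {  // byte tail
    *dst++ = *src++;
  }
}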
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
- (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
- // U10
- "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
- // U12
- "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
- // U14
- "addiu %[src_uv], %[src_uv], 32 \n"
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
- // V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
- // U12
- "sw $t9, 0(%[dst_v]) \n"
- "sw $t0, 0(%[dst_u]) \n"
- "sw $t1, 4(%[dst_v]) \n"
- "sw $t2, 4(%[dst_u]) \n"
- "sw $t3, 8(%[dst_v]) \n"
- "sw $t5, 8(%[dst_u]) \n"
- "sw $t6, 12(%[dst_v]) \n"
- "sw $t7, 12(%[dst_u]) \n"
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
- [dst_v] "+r"(dst_v)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
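
// The scalar operation the precr/precrq pairing above vectorizes, as a
// hypothetical reference:
#include <stdint.h>
static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes form the U plane
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes form the V plane
  }
}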
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "andi $t5, %[width], 0xf \n"
- "blez $t4, 2f \n"
- " addu %[src], %[src], %[width] \n" // src += width
-
- "1: \n"
- "lw $t0, -16(%[src]) \n" // |3|2|1|0|
- "lw $t1, -12(%[src]) \n" // |7|6|5|4|
- "lw $t2, -8(%[src]) \n" // |11|10|9|8|
- "lw $t3, -4(%[src]) \n" // |15|14|13|12|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t1, $t1 \n" // |6|7|4|5|
- "wsbh $t2, $t2 \n" // |10|11|8|9|
- "wsbh $t3, $t3 \n" // |14|15|12|13|
- "rotr $t0, $t0, 16 \n" // |0|1|2|3|
- "rotr $t1, $t1, 16 \n" // |4|5|6|7|
- "rotr $t2, $t2, 16 \n" // |8|9|10|11|
- "rotr $t3, $t3, 16 \n" // |12|13|14|15|
- "addiu %[src], %[src], -16 \n"
- "addiu $t4, $t4, -1 \n"
- "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
- "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
- "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
- "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
- "bgtz $t4, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
- "beqz $t5, 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -1(%[src]) \n"
- "addiu $t5, $t5, -1 \n"
- "addiu %[src], %[src], -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgez $t5, 2b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5");
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- int x;
- int y;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "addu $t4, %[width], %[width] \n"
- "srl %[x], %[width], 4 \n"
- "andi %[y], %[width], 0xf \n"
- "blez %[x], 2f \n"
- " addu %[src_uv], %[src_uv], $t4 \n"
-
- "1: \n"
- "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
- "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
- "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
- "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
- "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
- "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
- "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
- "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
-
- "rotr $t0, $t0, 16 \n" // |1|0|3|2|
- "rotr $t1, $t1, 16 \n" // |5|4|7|6|
- "rotr $t2, $t2, 16 \n" // |9|8|11|10|
- "rotr $t3, $t3, 16 \n" // |13|12|15|14|
- "rotr $t4, $t4, 16 \n" // |17|16|19|18|
- "rotr $t6, $t6, 16 \n" // |21|20|23|22|
- "rotr $t7, $t7, 16 \n" // |25|24|27|26|
- "rotr $t8, $t8, 16 \n" // |29|28|31|30|
- "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
- "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
- "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
- "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
- "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
- "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
- "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
- "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
- "addiu %[src_uv], %[src_uv], -32 \n"
- "addiu %[x], %[x], -1 \n"
- "swr $t4, 0(%[dst_u]) \n"
- "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
- "swr $t6, 0(%[dst_v]) \n"
- "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
- "swr $t3, 4(%[dst_v]) \n"
- "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
- "swr $t0, 8(%[dst_u]) \n"
- "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
- "swr $t1, 8(%[dst_v]) \n"
- "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
- "swr $t9, 12(%[dst_u]) \n"
- "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
- "swr $t5, 12(%[dst_v]) \n"
- "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz %[x], 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
- "beqz %[y], 3f \n"
- " nop \n"
- "b 2f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -2(%[src_uv]) \n"
- "lbu $t1, -1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], -2 \n"
- "addiu %[y], %[y], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[y], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
- [x] "=&r"(x), [y] "=&r"(y)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
-}
-
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x7fff7fff;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub =
-     ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr =
-     ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
- [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 4 pixels.
- }
-}
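
// A hedged scalar sketch of the fixed-point pixel math vectorized above,
// following the shape of this library's C reference: Y is replicated and
// scaled with 16 fractional bits, U/V products are applied against biases
// that already fold in the UV offset, and a >> 6 with saturation yields
// 8-bit channels. All _Sketch names are hypothetical.
#include <stdint.h>
static uint8_t Clamp8_Sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_Sketch(int y, int u, int v, int yg, int ub, int ug,
                            int vg, int vr, int bb, int bg, int br,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y * 0x0101 * yg) >> 16;  // replicate Y to 16 bits, apply gain
  *b = Clamp8_Sketch((y1 - u * ub + bb) >> 6);
  *g = Clamp8_Sketch((y1 - u * ug - v * vg + bg) >> 6);
  *r = Clamp8_Sketch((y1 - v * vr + br) >> 6);
}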
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y0_fraction = 256 - source_y_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "replv.ph $t0, %[y0_fraction] \n"
- "replv.ph $t1, %[source_y_fraction] \n"
-
- "1: \n"
- "lw $t2, 0(%[src_ptr]) \n"
- "lw $t3, 0(%[src_ptr1]) \n"
- "lw $t4, 4(%[src_ptr]) \n"
- "lw $t5, 4(%[src_ptr1]) \n"
- "muleu_s.ph.qbl $t6, $t2, $t0 \n"
- "muleu_s.ph.qbr $t7, $t2, $t0 \n"
- "muleu_s.ph.qbl $t8, $t3, $t1 \n"
- "muleu_s.ph.qbr $t9, $t3, $t1 \n"
- "muleu_s.ph.qbl $t2, $t4, $t0 \n"
- "muleu_s.ph.qbr $t3, $t4, $t0 \n"
- "muleu_s.ph.qbl $t4, $t5, $t1 \n"
- "muleu_s.ph.qbr $t5, $t5, $t1 \n"
- "addq.ph $t6, $t6, $t8 \n"
- "addq.ph $t7, $t7, $t9 \n"
- "addq.ph $t2, $t2, $t4 \n"
- "addq.ph $t3, $t3, $t5 \n"
- "shra_r.ph $t6, $t6, 8 \n"
- "shra_r.ph $t7, $t7, 8 \n"
- "shra_r.ph $t2, $t2, 8 \n"
- "shra_r.ph $t3, $t3, 8 \n"
- "precr.qb.ph $t6, $t6, $t7 \n"
- "precr.qb.ph $t2, $t2, $t3 \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[src_ptr1], %[src_ptr1], 8 \n"
- "addiu %[dst_width], %[dst_width], -8 \n"
- "sw $t6, 0(%[dst_ptr]) \n"
- "sw $t2, 4(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[dst_ptr], %[dst_ptr], 8 \n"
-
- ".set pop \n"
- : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
- [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
- : [source_y_fraction] "r"(source_y_fraction),
- [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
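-
-// For reference, a plain-C model of the row above (an illustrative sketch;
-// the helper name is hypothetical): each output byte blends the two source
-// rows with 8-bit fixed-point weights, rounding like shra_r.ph.
-static void InterpolateRow_C_Sketch(uint8* dst_ptr,
-                                    const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    int dst_width,
-                                    int source_y_fraction) {
-  int y0_fraction = 256 - source_y_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
-                          src_ptr1[x] * source_y_fraction + 128) >> 8);
-  }
-}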
-
-void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1;
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb24], %[src_rgb24], 3 \n"
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- uint8 b = src_rgb24[0];
- uint8 g = src_rgb24[1];
- uint8 r = src_rgb24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2;
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_raw]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_raw], %[src_raw], 3 \n"
- "srl %[tmp_t2], %[tmp_t1], 16 \n"
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
- "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
- "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb565], %[src_rgb565], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 8 \n"
- "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
- "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
- "srl %[tmp_t3], %[tmp_t1], 9 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
- [dst_argb] "+r"(dst_argb)
- : [tmp_mask] "r"(tmp_mask));
- }
-}
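-
-// The shift/ins sequence above is the standard 565 expansion: replicate each
-// field's top bits into its low bits so full-scale 5- and 6-bit values map
-// to 0xff. Per-pixel scalar form (illustrative sketch, hypothetical name):
-static __inline uint32 RGB565ToARGBPixel_Sketch(uint32 p) {
-  uint32 b = p & 0x1f;
-  uint32 g = (p >> 5) & 0x3f;
-  uint32 r = (p >> 11) & 0x1f;
-  b = (b << 3) | (b >> 2);
-  g = (g << 2) | (g >> 4);
-  r = (r << 3) | (r >> 2);
-  return 0xff000000u | (r << 16) | (g << 8) | b;
-}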
-
-void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb1555]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb1555], %[src_argb1555], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 9 \n"
- "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
- "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
- "srl %[tmp_t3], %[tmp_t1], 7 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
- [dst_argb] "+r"(dst_argb)
- :);
- }
-}
-
-void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb4444]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb4444], %[src_argb4444], 2 \n"
- "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
- "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1));
- }
-}
-
-void I444ToARGBRow_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_mask = 0x7fff7fff;
- uint32 tmp_yg;
-
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[y_buf]) \n"
- "lbu %[tmp_t1], 1(%[y_buf]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lh %[tmp_t2], 0(%[u_buf]) \n"
- "lh %[tmp_t3], 0(%[v_buf]) \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
- [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
- [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
- y_buf += 2;
- u_buf += 2;
- v_buf += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-void I422ToARGB4444Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x7fff7fff;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
- "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
- "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
- "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
- "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
- "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb4444 += 4; // Advance 2 pixels.
- }
-}
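-
-// The shrl/or/precr tail above narrows ARGB8888 to ARGB4444 by keeping the
-// high nibble of every channel; per pixel (illustrative sketch, hypothetical
-// name):
-static __inline uint16 ARGBToARGB4444Pixel_Sketch(const uint8* argb) {
-  uint32 b = argb[0] >> 4;
-  uint32 g = argb[1] >> 4;
-  uint32 r = argb[2] >> 4;
-  uint32 a = argb[3] >> 4;
-  return (uint16)(b | (g << 4) | (r << 8) | (a << 12));
-}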
-
-void I422ToARGB1555Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x80008000;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
- "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
- "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
- "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
- "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
- "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
- "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
- "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb1555 += 4; // Advance 2 pixels.
- }
-}
-
-void NV12ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_mask = 0x7fff7fff;
- uint32 tmp_yg;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_uv]) \n"
- "lbu %[tmp_t3], 1(%[src_uv]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
- [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
- [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
- [tmp_mask] "r"(tmp_mask));
-
- src_y += 2;
- src_uv += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffda0000;
- int const2 = 0x0070ffb6;
- int const3 = 0x00700000;
- int const4 = 0xffeeffa2;
- int const5 = 0x100;
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
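-
-// const1..const4 above hold the BT.601 chroma weights (112, -74, -38 for U;
-// 112, -94, -18 for V) arranged for BGRA's A,R,G,B byte layout, and the
-// const5*const5 accumulator preload plus extr_r.w rounding supplies the
-// +128 bias. On the 2x2-averaged pixel the math reduces to the scalar form
-// below (illustrative sketch, hypothetical names); the ABGR/ARGB/RGBA
-// variants that follow differ only in how the constants are permuted.
-static __inline uint8 RGBToU_Sketch(int r, int g, int b) {
-  return (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
-}
-static __inline uint8 RGBToV_Sketch(int r, int g, int b) {
-  return (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
-}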
-
-void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420000;
- int const2 = 0x00190081;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb6ffda;
- int const2 = 0x00000070;
- int const3 = 0xffa20070;
- int const4 = 0x0000ffee;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810019;
- int const2 = 0x00000042;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
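-
-// const1/const2 above pack the BT.601 luma weights (66*R + 129*G + 25*B),
-// and the const5*const5 accumulator preload plus extr_r.w rounding supplies
-// the 0x1080 bias (16.5 in 8.8 fixed point). Scalar form (illustrative
-// sketch, hypothetical name):
-static __inline uint8 RGBToY_Sketch(int r, int g, int b) {
-  return (uint8)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
-}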
-
-void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810042;
- int const2 = 0x00000019;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
- "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
- "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
- "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420081;
- int const2 = 0x00190000;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-#endif // __mips_dsp_rev >= 2
-
-#endif // defined(__mips__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
deleted file mode 100644
index decd3d2e..00000000
--- a/files/source/row_gcc.cc
+++ /dev/null
@@ -1,6798 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-// Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
-
-// JPEG full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0};
-
-static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
- 127, -84, -43, 0, 127, -84, -43, 0};
-
-static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
- -18, -94, 112, 0, -18, -94, 112, 0};
-
-static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
- -20, -107, 127, 0, -20, -107, 127, 0};
-
-// Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
-
-static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
- 0, -38, -74, 112, 0, -38, -74, 112};
-
-static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
- 0, 112, -94, -18, 0, 112, -94, -18};
-
-// Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
-
-static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
- -38, -74, 112, 0, -38, -74, 112, 0};
-
-static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
- 112, -94, -18, 0, 112, -94, -18, 0};
-
-// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
-
-static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
- 0, 112, -74, -38, 0, 112, -74, -38};
-
-static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
- 0, -18, -94, 112, 0, -18, -94, 112};
-
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7-bit fixed-point 0.5 (64/128).
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
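-
-// The weights above are the 8-bit BT.601 coefficients halved to 7 bits so
-// they fit pmaddubsw; e.g. the SSSE3 luma path computes approximately
-// Y = ((13*B + 65*G + 33*R) >> 7) + 16 (a sketch of the math only, not the
-// exact instruction sequence):
-static __inline uint8_t ARGBToY7_Sketch(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
-}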
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-
-// Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
-
-// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
- 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
-
-// Shuffle table for converting RAW to RGB24. First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RGB24.
-static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RGB24, for I422ToRGB24. First 8 + next 4.
-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
-
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
- 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
- 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
-
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
- 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
- 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
- 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
- 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
- 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
- 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-#endif // HAS_RGB24TOARGBROW_SSSE3
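-
-// Each table above is a pshufb control mask: output byte i becomes
-// src[mask[i] & 15], or zero when mask[i] has its top bit set (the 128u
-// entries). A scalar model of the instruction (illustrative):
-static __inline void Pshufb_Sketch(const uint8_t* src,
-                                   const uint8_t* mask,
-                                   uint8_t* dst) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    dst[i] = (uint8_t)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
-  }
-}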
-
-#ifdef HAS_J400TOARGBROW_SSE2
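-// Expand gray J400 to ARGB: punpcklbw/punpcklwd replicate each Y byte into
-// B, G and R, and por merges the 0xff000000 alpha mask built in xmm5.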
-void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_J400TOARGBROW_SSE2
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToRGB24_0), // %3
- "m"(kShuffleMaskRAWToRGB24_1), // %4
- "m"(kShuffleMaskRAWToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
- "xmm6", "xmm7");
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
- "xmm6", "xmm7");
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- "movdqa %3,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- "movdqa %3,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-#ifdef HAS_ARGBTORGB24ROW_AVX2
-// vpermd for 12+12 to 24
-static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
-
-void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRGB24), // %3
- "m"(kPermdRGB24_AVX) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif
-
-#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
-// Shuffle table for converting ARGBToRGB24
-static const ulvec8 kPermARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
- 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
- 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
-static const ulvec8 kPermARGBToRGB24_1 = {
- 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
- 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
- 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
-static const ulvec8 kPermARGBToRGB24_2 = {
- 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
- 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
- 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
-
-void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kPermARGBToRGB24_0), // %3
- "m"(kPermARGBToRGB24_1), // %4
- "m"(kPermARGBToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
-}
-#endif
-
-#ifdef HAS_ARGBTORAWROW_AVX2
-void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRAW), // %3
- "m"(kPermdRGB24_AVX) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif
-
-void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
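-
-// For reference, a scalar sketch of what each iteration above computes per
-// pixel: the three shifts isolate the top 5, 6 and 5 bits of B, G and R
-// before they are masked and OR'd together. Illustrative helper only, not
-// part of libyuv.
-static uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
-}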
-
-void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
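-
-// The dither variant is the same packing with one extra step: each channel
-// gets a per-column dither byte (expanded from the 4 byte dither4 pattern by
-// the punpck instructions above) added with unsigned saturation before the
-// 8-to-5/6/5 truncation. A scalar sketch of that step (illustrative only):
-static uint8_t AddDitherSat(uint8_t v, uint8_t dither) {
-  int sum = v + dither;
-  return (uint8_t)(sum > 255 ? 255 : sum);  // what paddusb does per byte
-}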
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTORGB565DITHERROW_AVX2
-
-void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
-}
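-
-// Scalar equivalent of the ARGB1555 packing above, assuming the usual bit
-// layout of 1 bit alpha over 5 bits each of R, G and B (illustrative helper,
-// not part of libyuv):
-static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
-  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
-                    (b >> 3));
-}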
-
-void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_RGB24TOARGBROW_SSSE3
-
-/*
-
-ARGBToAR30Row:
-
-Red and Blue
-With the 8 bit value in the upper byte of a 16 bit word, vpmulhuw by
-(1024+4) produces a 10 bit value in the low 10 bits of each 16 bit word,
-which is what is wanted for the blue channel. Red also needs to be shifted
-4 left, so it is multiplied by (1024+4)*16 instead.
-
-Alpha and Green
-Alpha and green already sit in the high bits, so vpand can zero out the
-other bits, keeping just the upper 2 bits of alpha and the 8 bit green. The
-same multiplier, (1024+4), would put a 10 bit green in the low bits. Alpha
-needs a simple multiplier to shift it into position: it wants a gap of 10
-bits above the green. Green is 10 bits, so there are 6 bits in the low
-word; 4 more are needed, so a multiplier of 4 moves the 2 alpha bits into
-the upper 16 bits, and a further shift of 4 is a multiply by 16, giving
-(4*16) = 64. The result is then shifted left 10 to position the A and G
-channels.
-*/
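-
-// A scalar sketch of the 8-to-10 bit trick described above. With the 8 bit
-// channel value v in the high byte of a 16 bit word, pmulhuw keeps the top
-// 16 bits of the product, so multiplying by 1028 (= 1024 + 4) reduces to
-// (v * 1028) >> 8, which is (v << 2) | (v >> 6): the top 2 bits of v are
-// replicated into the bottom of the 10 bit result. Illustrative helper:
-static uint16_t Scale8To10(uint8_t v) {
-  uint16_t word = (uint16_t)(v << 8);                // v in the high byte
-  return (uint16_t)(((uint32_t)word * 1028) >> 16);  // what pmulhuw keeps
-}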
-
-// Shuffle tables to place the B and R bytes of each pixel in the high byte
-// of a pair of 16 bit lanes (low byte zeroed), ready for the AR30 multiplies.
-static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
- 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
-
-static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
- 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
-
-static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
-static const uint32_t kMaskRB10 = 0x3ff003ff;
-static const uint32_t kMaskAG10 = 0xc000ff00;
-static const uint32_t kMulAG10 = 64 * 65536 + 1028;
-
-void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleRB30), // %3
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleBR30), // %3 reversed shuffler
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-#ifdef HAS_ARGBTOAR30ROW_AVX2
-void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleRB30), // %3
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif
-
-#ifdef HAS_ABGRTOAR30ROW_AVX2
-void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleBR30), // %3 reversed shuffler
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBTOYROW_SSSE3
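-
-// A scalar sketch of the per-pixel math above, assuming libyuv's usual
-// BT.601 coefficients (kARGBToY = 13, 65, 33 for B, G, R): pmaddubsw and
-// phaddw form the weighted sum, psrlw $7 divides by 128, and paddb adds the
-// +16 studio-swing offset. Illustrative helper, not part of libyuv.
-static uint8_t RGBToY601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
-}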
-
-#ifdef HAS_ARGBTOYJROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but with different (full range) coefficients; no +16
-// offset is added, and the sum is rounded before the shift.
-void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBTOYJROW_SSSE3
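-
-// The J variant in scalar form, assuming the full range JPeg coefficients
-// (kARGBToYJ = 15, 75, 38 for B, G, R) and the +64 bias from kAddYJ64, which
-// rounds the 7 bit fixed point sum instead of adding an offset. Illustrative
-// helper, not part of libyuv.
-static uint8_t RGBToYJpeg(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
-}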
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// vpermd table to undo the cross-lane interleaving left by vphaddw and
-// vpackuswb.
-static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
-
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYJROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBTOYJROW_AVX2
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToV), // %5
- "m"(kARGBToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-#endif // HAS_ARGBTOUVROW_SSSE3
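-
-// Scalar view of the chroma math above, assuming libyuv's BT.601 style
-// coefficients (112, -74, -38 for U and 112, -94, -18 for V, applied to
-// B/G/R and R/G/B respectively): each output sample comes from a 2x2 average
-// of ARGB pixels, then a signed weighted sum scaled by >>8 with a +128 bias
-// (the 0x8080 below folds in the bias plus rounding). Illustrative helpers:
-static uint8_t RGBToU601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
-}
-static uint8_t RGBToV601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
-}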
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-// vpshufb table to undo the within-lane word interleaving left by vphaddw,
-// applied to the packed shorts.
-static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
-
- "vextractf128 $0x0,%%ymm0,(%1) \n"
- "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUV128), // %5
- "m"(kARGBToV), // %6
- "m"(kARGBToU), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTOUVROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
-
- "vextractf128 $0x0,%%ymm0,(%1) \n"
- "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
- "m"(kARGBToVJ), // %6
- "m"(kARGBToUJ), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTOUVJROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToVJ), // %5
- "m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-#endif // HAS_ARGBTOUVJROW_SSSE3
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "m"(kARGBToV), // %4
- "m"(kARGBToU), // %5
- "m"(kAddUV128) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
-}
-#endif // HAS_ARGBTOUV444ROW_SSSE3
-
-void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)), // %4
- "m"(kBGRAToV), // %5
- "m"(kBGRAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)), // %4
- "m"(kABGRToV), // %5
- "m"(kABGRToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)), // %4
- "m"(kRGBAToV), // %5
- "m"(kRGBAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
-
-// Read 8 UV from 444
-#define READYUV444 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422 10 bit, upsample to 8 UV
-// TODO(fbarchard): Consider shufb to replace pack/unpack
-// TODO(fbarchard): Consider pmulhuw to replace psraw
-// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
-#define READYUV210 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklwd %%xmm1,%%xmm0 \n" \
- "psraw $0x2,%%xmm0 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movdqu (%[y_buf]),%%xmm4 \n" \
- "psllw $0x6,%%xmm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "movq (%[a_buf]),%%xmm5 \n" \
- "lea 0x8(%[a_buf]),%[a_buf] \n"
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq (%[uv_buf]),%%xmm0 \n" \
- "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21 \
- "movq (%[vu_buf]),%%xmm0 \n" \
- "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
- "pshufb %[kShuffleNV21], %%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2 \
- "movdqu (%[yuy2_buf]),%%xmm4 \n" \
- "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
- "movdqu (%[yuy2_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
- "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
-
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY \
- "movdqu (%[uyvy_buf]),%%xmm4 \n" \
- "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
- "movdqu (%[uyvy_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
- "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants) \
- "movdqa (%[yuvconstants]),%%xmm8 \n" \
- "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
- "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
- "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
- "movdqa 192(%[yuvconstants]),%%xmm14 \n"
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa %%xmm11,%%xmm0 \n" \
- "pmaddubsw %%xmm8,%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa %%xmm12,%%xmm1 \n" \
- "pmaddubsw %%xmm9,%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa %%xmm13,%%xmm2 \n" \
- "pmaddubsw %%xmm10,%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw %%xmm14,%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
-#define YUVTORGB_REGS \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-
-#else
-#define YUVTORGB_SETUP(yuvconstants)
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
- "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
- "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
- "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
-#define YUVTORGB_REGS
-#endif
-
-#define YUVTORGB(yuvconstants) \
- YUVTORGB16(yuvconstants) \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
-
-// Store 8 ARGB values.
-#define STOREARGB \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklbw %%xmm5,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm1 \n" \
- "punpcklwd %%xmm2,%%xmm0 \n" \
- "punpckhwd %%xmm2,%%xmm1 \n" \
- "movdqu %%xmm0,(%[dst_argb]) \n" \
- "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
- "lea 0x20(%[dst_argb]), %[dst_argb] \n"
-
-// Store 8 RGBA values.
-#define STORERGBA \
- "pcmpeqb %%xmm5,%%xmm5 \n" \
- "punpcklbw %%xmm2,%%xmm1 \n" \
- "punpcklbw %%xmm0,%%xmm5 \n" \
- "movdqa %%xmm5,%%xmm0 \n" \
- "punpcklwd %%xmm1,%%xmm5 \n" \
- "punpckhwd %%xmm1,%%xmm0 \n" \
- "movdqu %%xmm5,(%[dst_rgba]) \n" \
- "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
- "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
-
-// Store 8 AR30 values.
-#define STOREAR30 \
- "psraw $0x4,%%xmm0 \n" \
- "psraw $0x4,%%xmm1 \n" \
- "psraw $0x4,%%xmm2 \n" \
- "pminsw %%xmm7,%%xmm0 \n" \
- "pminsw %%xmm7,%%xmm1 \n" \
- "pminsw %%xmm7,%%xmm2 \n" \
- "pmaxsw %%xmm6,%%xmm0 \n" \
- "pmaxsw %%xmm6,%%xmm1 \n" \
- "pmaxsw %%xmm6,%%xmm2 \n" \
- "psllw $0x4,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "punpcklwd %%xmm2,%%xmm0 \n" \
- "punpckhwd %%xmm2,%%xmm3 \n" \
- "movdqa %%xmm1,%%xmm2 \n" \
- "punpcklwd %%xmm5,%%xmm1 \n" \
- "punpckhwd %%xmm5,%%xmm2 \n" \
- "pslld $0xa,%%xmm1 \n" \
- "pslld $0xa,%%xmm2 \n" \
- "por %%xmm1,%%xmm0 \n" \
- "por %%xmm2,%%xmm3 \n" \
- "movdqu %%xmm0,(%[dst_ar30]) \n" \
- "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
- "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB16(yuvconstants)
- STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-
-// 10 bit YUV to ARGB
-void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-// 10 bit YUV to AR30
-void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
-
- LABELALIGN
- "1: \n"
- READYUV210
- YUVTORGB16(yuvconstants)
- STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_I422ALPHATOARGBROW_SSSE3
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV12
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV21
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUY2
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READUYVY
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-#endif // HAS_I422TOARGBROW_SSSE3
-
-// Read 16 UV from 444
-#define READYUV444_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
- "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
- "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 210 10 bit, upsample to 16 UV
-// TODO(fbarchard): Consider vshufb to replace pack/unpack
-// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
-#define READYUV210_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
- "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
- "lea 0x20(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
- "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n" \
- "vmovdqu (%[a_buf]),%%xmm5 \n" \
- "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
- "lea 0x10(%[a_buf]),%[a_buf] \n"
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- "vmovdqu (%[uv_buf]),%%xmm0 \n" \
- "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- "vmovdqu (%[vu_buf]),%%xmm0 \n" \
- "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
- "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
- "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
- "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
- "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
- "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
- "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
- "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
- "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
- "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
- "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
-
-#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
- "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
- "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
- "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
- "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
- "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
- "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
-
-#define YUVTORGB_REGS_AVX2 \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-
-#else // Convert 16 pixels: 16 UV and 16 Y.
-
-#define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
- "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
- "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
- "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
- "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
- "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
- "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
-#define YUVTORGB_REGS_AVX2
-#endif
-
-#define YUVTORGB_AVX2(yuvconstants) \
- YUVTORGB16_AVX2(yuvconstants) \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
-
-// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vmovdqu %%ymm1,(%[dst_argb]) \n" \
- "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
- "lea 0x40(%[dst_argb]), %[dst_argb] \n"
-
-// Store 16 AR30 values.
-#define STOREAR30_AVX2 \
- "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
- "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
- "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
- "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
- "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
- "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
- "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
- "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
- "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
- "vpslld $0xa,%%ymm1,%%ymm1 \n" \
- "vpslld $0xa,%%ymm2,%%ymm2 \n" \
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
- "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
- "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
- "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
-
-#ifdef HAS_I444TOARGBROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV444_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I444TOARGBROW_AVX2
-
-#if defined(HAS_I422TOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#if defined(HAS_I422TOAR30ROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
-void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB16_AVX2(yuvconstants)
- STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-#endif // HAS_I422TOAR30ROW_AVX2
-
-#if defined(HAS_I210TOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I210TOARGBROW_AVX2
-
-#if defined(HAS_I210TOAR30ROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
-void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210_AVX2
- YUVTORGB16_AVX2(yuvconstants)
- STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I210TOAR30ROW_AVX2
-
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUVA422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_I422ALPHATOARGBROW_AVX2
-
-#if defined(HAS_I422TORGBAROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
-
- // Step 3: Weave into RGBA
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
- "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
- "vmovdqu %%ymm0,(%[dst_argb]) \n"
- "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
- "lea 0x40(%[dst_argb]),%[dst_argb] \n"
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I422TORGBAROW_AVX2
-
-#if defined(HAS_NV12TOARGBROW_AVX2)
-// 16 pixels.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV12_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_NV12TOARGBROW_AVX2
-
-#if defined(HAS_NV21TOARGBROW_AVX2)
-// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV21_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_NV21TOARGBROW_AVX2
-
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUY2_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_YUY2TOARGBROW_AVX2
-
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READUYVY_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_UYVYTOARGBROW_AVX2
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
-
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
-
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_I400TOARGBROW_AVX2
-// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
-// Note: vpunpcklbw mutates the 128-bit lane order and vpackuswb unmutates it.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
-
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
- "vmovdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_I400TOARGBROW_AVX2
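-
-// A scalar sketch (not part of the original source) of the I400 math above,
-// following the same fixed point steps as the SSE2/AVX2 rows; the helper
-// name is hypothetical.
-static inline uint32_t I400ToARGBPixel_Sketch(uint8_t y) {
-  int g = (y * 257 * 18997) >> 16;  // replicate y to 16 bits, * 1.164 (pmulhuw)
-  g -= 1160;                        // remove the black level bias (psubusw...
-  if (g < 0) g = 0;                 //   ...saturates at zero)
-  g >>= 6;                          // psrlw $0x6 back to 8 bit range
-  if (g > 255) g = 255;             // packuswb saturation
-  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | (uint32_t)g;
-}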
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-
-void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "movdqa %3,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "vbroadcastf128 %3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORUVROW_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_MIRRORUVROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSE2
-
-void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "lea -0x10(%0,%2,4),%0 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBMIRRORROW_SSE2
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "vmovdqu %3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror_AVX2) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
-
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
-
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGEUVROW_SSE2
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
-#ifdef HAS_MERGEUVROW_16_AVX2
-void MergeUVRow_16_AVX2(const uint16_t* src_u,
- const uint16_t* src_v,
- uint16_t* dst_uv,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- : "r"(scale) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
- // clang-format on
-}
-#endif // HAS_MERGEUVROW_16_AVX2
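-
-// A scalar sketch (not part of the original source) of MergeUVRow_16_AVX2:
-// each lsb justified sample is shifted up to the msb by the power-of-two
-// scale (the vpmullw), then U and V are interleaved. Hypothetical name.
-static void MergeUVRow_16_Sketch(const uint16_t* src_u, const uint16_t* src_v,
-                                 uint16_t* dst_uv, int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_uv[2 * i + 0] = (uint16_t)(src_u[i] * scale);  // e.g. scale 64 for 10 bits
-    dst_uv[2 * i + 1] = (uint16_t)(src_v[i] * scale);
-  }
-}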
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
-#ifdef HAS_MULTIPLYROW_16_AVX2
-void MultiplyRow_16_AVX2(const uint16_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm3");
- // clang-format on
-}
-#endif // HAS_MULTIPLYROW_16_AVX2
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 32768 = 9 bits
-// 16384 = 10 bits
-// 4096 = 12 bits
-// 256 = 16 bits
-void Convert16To8Row_SSSE3(const uint16_t* src_y,
- uint8_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-
-#ifdef HAS_CONVERT16TO8ROW_AVX2
-void Convert16To8Row_AVX2(const uint16_t* src_y,
- uint8_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-#endif // HAS_CONVERT16TO8ROW_AVX2
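-
-// A scalar sketch (not part of the original source) of the pmulhuw trick in
-// the Convert16To8 rows: (v * scale) >> 16 maps an lsb justified 9-16 bit
-// sample down to 8 bits, e.g. scale 16384 for 10 bit input. Hypothetical name.
-static void Convert16To8Row_Sketch(const uint16_t* src_y, uint8_t* dst_y,
-                                   int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t v = ((uint32_t)src_y[i] * (uint32_t)scale) >> 16;  // pmulhuw
-    dst_y[i] = (uint8_t)(v > 255 ? 255 : v);                    // packuswb clamp
-  }
-}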
-
-// Use scale to convert to lsb formats, depending on how many bits there are:
-// 512 = 9 bits
-// 1024 = 10 bits
-// 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
-void Convert8To16Row_SSE2(const uint8_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-
-#ifdef HAS_CONVERT8TO16ROW_AVX2
-void Convert8To16Row_AVX2(const uint8_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-#endif // HAS_CONVERT8TO16ROW_AVX2
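-
-// A scalar sketch (not part of the original source) of the Convert8To16 rows:
-// the unpack-with-self makes v * 257 (v in both bytes of a word) and pmulhuw
-// rescales it, e.g. scale 1024 maps 255 to 1023 for 10 bits. Hypothetical name.
-static void Convert8To16Row_Sketch(const uint8_t* src_y, uint16_t* dst_y,
-                                   int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t v = (uint32_t)src_y[i] * 257u;              // punpcklbw with self
-    dst_y[i] = (uint16_t)((v * (uint32_t)scale) >> 16);  // pmulhuw
-  }
-}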
-
-#ifdef HAS_SPLITRGBROW_SSSE3
-
-// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
- 2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u};
-
-static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
- 3u, 6u, 9u, 12u, 15u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 2u,
- 5u, 8u, 11u, 14u};
-
-static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 0u, 3u,
- 6u, 9u, 12u, 15u};
-
-void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : "m"(kShuffleMaskRGBToR0), // %5
- "m"(kShuffleMaskRGBToR1), // %6
- "m"(kShuffleMaskRGBToR2), // %7
- "m"(kShuffleMaskRGBToG0), // %8
- "m"(kShuffleMaskRGBToG1), // %9
- "m"(kShuffleMaskRGBToG2), // %10
- "m"(kShuffleMaskRGBToB0), // %11
- "m"(kShuffleMaskRGBToB1), // %12
- "m"(kShuffleMaskRGBToB2) // %13
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_SPLITRGBROW_SSSE3
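-
-// A scalar sketch (not part of the original source) of the deinterleave the
-// shuffle tables above implement: packed 24 bit RGB split into three planes.
-// Hypothetical name.
-static void SplitRGBRow_Sketch(const uint8_t* src_rgb, uint8_t* dst_r,
-                               uint8_t* dst_g, uint8_t* dst_b, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_r[i] = src_rgb[3 * i + 0];
-    dst_g[i] = src_rgb[3 * i + 1];
-    dst_b[i] = src_rgb[3 * i + 2];
-  }
-}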
-
-#ifdef HAS_MERGERGBROW_SSSE3
-
-// Shuffle table for converting Planar to RGB.
-static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
- 2u, 128u, 128u, 3u, 128u, 128u,
- 4u, 128u, 128u, 5u};
-static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
- 128u, 2u, 128u, 128u, 3u, 128u,
- 128u, 4u, 128u, 128u};
-static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
- 128u, 128u, 2u, 128u, 128u, 3u,
- 128u, 128u, 4u, 128u};
-
-static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
- 7u, 128u, 128u, 8u, 128u, 128u,
- 9u, 128u, 128u, 10u};
-static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
- 128u, 7u, 128u, 128u, 8u, 128u,
- 128u, 9u, 128u, 128u};
-static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
- 128u, 128u, 8u, 128u, 128u, 9u,
- 128u, 128u, 10u, 128u};
-
-static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
- 12u, 128u, 128u, 13u, 128u, 128u,
- 14u, 128u, 128u, 15u};
-static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
- 128u, 13u, 128u, 128u, 14u, 128u,
- 128u, 15u, 128u, 128u};
-static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
- 128u, 128u, 13u, 128u, 128u, 14u,
- 128u, 128u, 15u, 128u};
-
-void MergeRGBRow_SSSE3(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : "m"(kShuffleMaskRToRGB0), // %5
- "m"(kShuffleMaskGToRGB0), // %6
- "m"(kShuffleMaskBToRGB0), // %7
- "m"(kShuffleMaskRToRGB1), // %8
- "m"(kShuffleMaskGToRGB1), // %9
- "m"(kShuffleMaskBToRGB1), // %10
- "m"(kShuffleMaskRToRGB2), // %11
- "m"(kShuffleMaskGToRGB2), // %12
- "m"(kShuffleMaskBToRGB2) // %13
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGERGBROW_SSSE3
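-
-// A scalar sketch (not part of the original source) of the inverse of the
-// split above: three planes interleaved back into packed 24 bit RGB.
-// Hypothetical name.
-static void MergeRGBRow_Sketch(const uint8_t* src_r, const uint8_t* src_g,
-                               const uint8_t* src_b, uint8_t* dst_rgb,
-                               int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_rgb[3 * i + 0] = src_r[i];
-    dst_rgb[3 * i + 1] = src_g[i];
-    dst_rgb[3 * i + 2] = src_b[i];
-  }
-}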
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
-
- LABELALIGN
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
-
- LABELALIGN
- "2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_COPYROW_AVX
-
-#ifdef HAS_COPYROW_ERMS
-// Handles any width (multiple of 1).
-void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep movsb \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc");
-}
-#endif // HAS_COPYROW_ERMS
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-// width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-static const uvec8 kShuffleAlphaShort_AVX2 = {
- 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
- 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
-
-void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "vmovdqa %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- : "m"(kPermdARGBToY_AVX), // %3
- "m"(kShuffleAlphaShort_AVX2) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
- size_t width_tmp = (size_t)(width >> 2);
- const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
- asm volatile(
-
- "rep stosl \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-
-void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep stosb \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v8) // %2
- : "memory", "cc");
-}
-
-void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep stosl \n"
- : "+D"(dst_argb), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-
-void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_YUY2TOYROW_SSE2
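-
-// A scalar sketch (not part of the original source) of the YUY2 byte layout
-// the rows above unpack: Y0 U0 Y1 V0, luma in even bytes and chroma shared by
-// each pixel pair in odd bytes. Assumes an even width; hypothetical name.
-static void YUY2ToI422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
-                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; i += 2) {
-    dst_y[i + 0] = src_yuy2[2 * i + 0];  // pand 0x00ff keeps even bytes
-    dst_y[i + 1] = src_yuy2[2 * i + 2];
-    dst_u[i / 2] = src_yuy2[2 * i + 1];  // psrlw $0x8 keeps odd bytes
-    dst_v[i / 2] = src_yuy2[2 * i + 3];
-  }
-}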
-
-#ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
-
-// Blend 4 pixels at a time, with a 1 pixel loop for the tail.
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "m"(kShuffleAlpha) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
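-
-// A scalar sketch (not part of the original source) of the per-channel blend
-// above, assuming src_argb0 carries premultiplied (attenuated) color:
-// dst = min(255, fg + bg * (256 - fg_alpha) / 256), with the result alpha
-// forced opaque. Hypothetical name.
-static inline uint8_t BlendChannel_Sketch(uint8_t fg, uint8_t bg, uint8_t fg_a) {
-  uint32_t v = fg + ((bg * (256u - fg_a)) >> 8);  // pmullw, psrlw, paddusb
-  return (uint8_t)(v > 255 ? 255 : v);
-}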
-
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- ::"memory",
- "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
-}
-#endif // HAS_BLENDPLANEROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
- "vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
- "vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 32 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- ::"memory",
- "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_BLENDPLANEROW_AVX2
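-
-// A scalar sketch (not part of the original source) of the unsigned form of
-// the math in the comments above; the SIMD rows use the signed pmaddubsw
-// variant, which is intended to round the same way. Hypothetical name.
-static void BlendPlaneRow_Sketch(const uint8_t* src0, const uint8_t* src1,
-                                 const uint8_t* alpha, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t a = alpha[i];
-    dst[i] = (uint8_t)((src0[i] * a + src1[i] * (255u - a) + 255u) >> 8);
-  }
-}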
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
- 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
-static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
-// Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha0), // %3
- "m"(kShuffleAlpha1) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
- 128u, 128u, 14u, 15u, 14u, 15u,
- 14u, 15u, 128u, 128u};
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha_AVX2) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBATTENUATEROW_AVX2
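-
-// A scalar sketch (not part of the original source) of what attenuation
-// computes: each color channel is scaled by its pixel's alpha, roughly
-// c * a / 255. The rows above form it as ((c * 257) * (a * 257)) >> 24 via
-// the unpack-with-self and pmulhuw steps; alpha itself is passed through.
-// Hypothetical name.
-static inline uint8_t Attenuate_Sketch(uint8_t c, uint8_t a) {
-  return (uint8_t)((((uint32_t)c * 257u) * ((uint32_t)a * 257u)) >> 24);
-}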
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uintptr_t alpha;
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
-// Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uintptr_t alpha;
- asm volatile(
- "sub %0,%1 \n"
- "vbroadcastf128 %5,%%ymm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- // replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
- "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
- "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
- "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
- // end of VPGATHER
-
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8), // %4
- "m"(kUnattenShuffleAlpha_AVX2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBUNATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
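A scalar sketch of the gray conversion above: a luma value is computed with the kARGBToYJ weights (signed bytes which sum to 128, so the +64 bias from kAddYJ64 and the >>7 give a rounded unity-gain result) and replicated into B, G and R; alpha is kept. The weight values themselves are defined elsewhere in this file, so they are passed in here:

static void ARGBGrayRow_Sketch(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int width,
                               const int8_t w[3] /* kARGBToYJ B,G,R */) {
  for (int i = 0; i < width; ++i) {
    int y =
        (src_argb[0] * w[0] + src_argb[1] * w[1] + src_argb[2] * w[2] + 64) >>
        7;
    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8_t)y;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}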
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone
-static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
- 17, 68, 35, 0, 17, 68, 35, 0};
-
-static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
- 22, 88, 45, 0, 22, 88, 45, 0};
-
-static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
- 24, 98, 50, 0, 24, 98, 50, 0};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
- asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "m"(kARGBToSepiaB), // %2
- "m"(kARGBToSepiaG), // %3
- "m"(kARGBToSepiaR) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
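Spelled out from the constants and the formulas in the comment above, the sepia transform reduces to this in-place scalar loop (alpha untouched, results saturated to 255):

static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}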
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
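A scalar sketch of the color-matrix transform: reading the pmaddubsw/pshufd usage above, matrix_argb holds four rows of four signed coefficients (B, G, R, A weights for each output channel), summed and shifted right by 6 (the psraw $0x6), then saturated:

static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* m,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      int v = (src_argb[0] * m[c * 4 + 0] + src_argb[1] * m[c * 4 + 1] +
               src_argb[2] * m[c * 4 + 2] + src_argb[3] * m[c * 4 + 3]) >>
              6;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}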
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
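A scalar sketch of the quantization: scale is 16.16 fixed point (pmulhuw keeps the high 16 bits, i.e. >>16), mapping each channel into quantization steps, which pmullw/paddw then expand back by interval_size and interval_offset. Alpha is preserved via the 0xff000000 mask built above:

static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb,
                                   int scale,
                                   int interval_size,
                                   int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb += 4;  // dst_argb[3] (alpha) untouched
  }
}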
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
-
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
-
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__AVX2__)
- ,
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELYROW_SSE2
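Both Sobel kernels above reduce to a small signed filter followed by an absolute value (the pxor/psubw/pmaxsw idiom) and a clamp to 255 (packuswb). Scalar sketches of one output sample each:

static uint8_t SobelX_Sketch(const uint8_t* y0,
                             const uint8_t* y1,
                             const uint8_t* y2,
                             int x) {
  int sx = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
           (y2[x] - y2[x + 2]);
  if (sx < 0) sx = -sx;
  return (uint8_t)(sx > 255 ? 255 : sx);
}

static uint8_t SobelY_Sketch(const uint8_t* y0, const uint8_t* y1, int x) {
  int sy = (y0[x] - y1[x]) + 2 * (y0[x + 1] - y1[x + 1]) +
           (y0[x + 2] - y1[x + 2]);
  if (sy < 0) sy = -sy;
  return (uint8_t)(sy > 255 ? 255 : sy);
}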
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop.
- LABELALIGN
- "10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
-
- "19: \n"
- : "+r"(row), // %0
- "+r"(cumsum), // %1
- "+r"(previous_cumsum), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
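A scalar sketch of the row pass above: each pixel contributes four 32-bit channel sums, and the running sum of the current row is added to the row above (previous_cumsum) to build the integral image incrementally:

static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[i * 4 + c];
      cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
    }
  }
}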
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
- const int32_t* botleft,
- int width,
- int area,
- uint8_t* dst,
- int count) {
- asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
-
- // 4 pixel small loop.
- LABELALIGN
- "4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
-
- // 4 pixel loop
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop
- LABELALIGN
- "10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(topleft), // %0
- "+r"(botleft), // %1
- "+r"(dst), // %2
- "+rm"(count) // %3
- : "r"((intptr_t)(width)), // %4
- "rm"(area) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
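A scalar sketch of the box-filter average above: the four integral-image corners give the rectangle sum (tl - tr - bl + br), which the SIMD code scales by a reciprocal of the area rather than dividing. Note that, per the addressing above ((%0,%4,4)), the width argument indexes 32-bit lanes, not pixels:

static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft,
                                             int width,  // in int32 lanes
                                             int area,
                                             uint8_t* dst,
                                             int count) {
  for (int i = 0; i < count * 4; ++i) {
    int32_t sum = topleft[i] - topleft[i + width] - botleft[i] +
                  botleft[i + width];
    int32_t v = sum / area;
    dst[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}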
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy a row of ARGB pixels from a source image, stepping through the
-// source by the (du, dv) slope given in src_dudv.
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8_t* src_argb,
- int src_argb_stride,
- uint8_t* dst_argb,
- const float* src_dudv,
- int width) {
- intptr_t src_argb_stride_temp = src_argb_stride;
- intptr_t temp;
- asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
-
- // 4 pixel loop
- LABELALIGN
- "40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
-
- // 1 pixel loop
- LABELALIGN
- "10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_stride_temp), // %1
- "+r"(dst_argb), // %2
- "+r"(src_dudv), // %3
- "+rm"(width), // %4
- "=&r"(temp) // %5
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+rm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
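A scalar sketch of the general blend path above: a fixed-point lerp between the two source rows with an 8-bit fraction. The 0x80808080 bias trick lets pmaddubsw operate on signed data; the rounding bias is ignored in this sketch:

static void InterpolateRow_Sketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  int width,
                                  int source_y_fraction) {
  int f1 = source_y_fraction;  // 0..255 weight of the second row
  int f0 = 256 - f1;           // weight of the first row
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * f0 + src_ptr[i + src_stride] * f1) >> 8);
  }
}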
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
- "vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
- "vbroadcastss %%xmm4,%%ymm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "rep movsb \n"
- "jmp 999f \n"
-
- "99: \n"
- "vzeroupper \n"
- "999: \n"
- : "+D"(dst_ptr), // %0
- "+S"(src_ptr), // %1
- "+cm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
-}
-#endif // HAS_INTERPOLATEROW_AVX2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
-
- "movdqu (%3),%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_ARGBSHUFFLEROW_SSSE3
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
-
- "vbroadcastf128 (%3),%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_ARGBSHUFFLEROW_AVX2
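A scalar sketch of the shuffles above: shuffler is a 16-byte pshufb control, and the masks used with these functions repeat the same relative 4-byte pattern per pixel, so each pixel's bytes are simply permuted:

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = src_argb[shuffler[0]];  // e.g. {3,2,1,0,...} swaps order
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}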
-
-#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOYUY2ROW_SSE2
-
-#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOUYVYROW_SSE2
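A scalar sketch of the two packings above. Both emit one macropixel (two Y samples sharing one U and one V) per four output bytes; only the byte order differs. Width is assumed even here:

static void I422ToYUY2Row_Sketch(const uint8_t* y,
                                 const uint8_t* u,
                                 const uint8_t* v,
                                 uint8_t* dst,
                                 int width) {
  for (int i = 0; i < width; i += 2) {
    dst[0] = y[0];  // YUY2 layout: Y0 U Y1 V
    dst[1] = u[0];
    dst[2] = y[1];
    dst[3] = v[0];
    y += 2;
    u += 1;
    v += 1;
    dst += 4;
  }
}
// The UYVY variant writes U Y0 V Y1 with the same loop structure.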
-
-#ifdef HAS_I422TOYUY2ROW_AVX2
-void I422ToYUY2Row_AVX2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOYUY2ROW_AVX2
-
-#ifdef HAS_I422TOUYVYROW_AVX2
-void I422ToUYVYRow_AVX2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOUYVYROW_AVX2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- asm volatile(
-
- "pxor %%xmm3,%%xmm3 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- asm volatile(
- "vbroadcastf128 (%3),%%ymm4 \n"
- "vbroadcastf128 0x10(%3),%%ymm5 \n"
- "vbroadcastf128 0x20(%3),%%ymm6 \n"
- "vbroadcastf128 0x30(%3),%%ymm7 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
- "lea 0x8(%0),%0 \n"
- "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
- "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
- "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
- "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
- "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
- "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
- // X
- "vcvttps2dq %%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "vmovq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBPOLYNOMIALROW_AVX2
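A scalar sketch of the polynomial evaluation: poly holds four coefficients per channel (C0..C3 as four groups of four floats, matching the 0x00..0x30 loads above), and each channel is evaluated as a cubic then clamped:

static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     const float* poly,
                                     int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel within the pixel
    float x = (float)src_argb[i];
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    dst_argb[i] = (uint8_t)(v < 0.f ? 0 : (v > 255.f ? 255 : v));
  }
}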
-
-#ifdef HAS_HALFFLOATROW_SSE2
-static float kScaleBias = 1.9259299444e-34f;
-void HalfFloatRow_SSE2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- scale *= kScaleBias;
- asm volatile(
- "movd %3,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm2 \n" // 8 shorts
- "add $0x10,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
- "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
- "punpckhwd %%xmm5,%%xmm3 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "psrld $0xd,%%xmm2 \n"
- "psrld $0xd,%%xmm3 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,-0x10(%0,%1,1) \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(scale) // %3
- : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_HALFFLOATROW_SSE2
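The kScaleBias constant is 2^-112: multiplying a float by it subtracts 112 from the exponent, which is exactly the float-to-half exponent rebias (127 - 15), so the half-float bit pattern can then be read off with a 13-bit shift (the psrld $0xd above). A scalar sketch of the same trick, truncating and with no NaN/overflow handling:

static uint16_t HalfFloat_Sketch(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // * 2^-112 rebias
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // relies on <string.h>
  return (uint16_t)(bits >> 13);    // drop 13 mantissa bits -> half layout
}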
-
-#ifdef HAS_HALFFLOATROW_AVX2
-void HalfFloatRow_AVX2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- scale *= kScaleBias;
- asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
-#if defined(__x86_64__)
- : "x"(scale) // %3
-#else
- : "m"(scale) // %3
-#endif
- : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_HALFFLOATROW_AVX2
-
-#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloatRow_F16C(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd 0x10(%0),%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
- "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
-#if defined(__x86_64__)
- : "x"(scale) // %3
-#else
- : "m"(scale) // %3
-#endif
- : "memory", "cc", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_HALFFLOATROW_F16C
-
-#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
- asm volatile(
- "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd 0x10(%0),%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
- "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm2", "xmm3");
-}
-#endif // HAS_HALFFLOATROW_F16C
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Transform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- uintptr_t pixel_temp;
- asm volatile(
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
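A scalar sketch of the lookup above: table_argb is 256 entries of 4 bytes, each channel indexes its own column, and the pixels are rewritten in place:

static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
// The RGB variant below is identical except that it leaves alpha alone.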
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Transform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- uintptr_t pixel_temp;
- asm volatile(
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Transform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- const uint8_t* luma,
- uint32_t lumacoeff) {
- uintptr_t pixel_temp;
- uintptr_t table_temp;
- asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
- : "=&d"(pixel_temp), // %0
- "=&a"(table_temp), // %1
- "+r"(src_argb), // %2
- "+r"(dst_argb), // %3
- "+rm"(width) // %4
- : "r"(luma), // %5
- "rm"(lumacoeff) // %6
- : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
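A scalar sketch of the luma-adaptive lookup above: a per-pixel weighted luma, masked to its high byte (the pand with 0xff00 words), selects one 256-byte page of the luma table, and B/G/R are each mapped through that page while alpha is copied. Here lumacoeff is treated as four byte weights, the fourth typically zero:

static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         const uint8_t* luma,
                                         const uint8_t coeff[4]) {
  for (int i = 0; i < width; ++i) {
    uint32_t l = src_argb[0] * coeff[0] + src_argb[1] * coeff[1] +
                 src_argb[2] * coeff[2];
    const uint8_t* page = luma + (l & 0xff00);  // one 256-byte table page
    dst_argb[0] = page[src_argb[0]];
    dst_argb[1] = page[src_argb[1]];
    dst_argb[2] = page[src_argb[2]];
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}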
-
-#ifdef HAS_NV21TOYUV24ROW_AVX2
-
-// Blend and shuffle constants for NV21ToYUV24Row_AVX2.
-static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
- 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
-
-static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
-
-static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
-
-static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
- 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
-
-static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
- 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
-
-static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
- 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
-
-static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
- 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
-
-static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
- 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
-
-static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
- 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
-
-// NV21ToYUV24Row_AVX2
-void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- uint8_t* src_y_ptr;
- uint64_t src_offset = 0;
- uint64_t width64;
-
- width64 = width;
- src_y_ptr = (uint8_t*)src_y;
-
- asm volatile(
- "vmovdqu %5, %%ymm0 \n" // init blend value
- "vmovdqu %6, %%ymm1 \n" // init blend value
- "vmovdqu %7, %%ymm2 \n" // init blend value
- // "sub $0x20, %3 \n" //sub 32 from width for final loop
-
- LABELALIGN
- "1: \n" // label 1
- "vmovdqu (%0,%4), %%ymm3 \n" // src_y
- "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
- "vmovdqu (%1), %%ymm5 \n" // src_uv
- "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
- "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
- // shuf
- "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
- // shuf
- "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
- "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
- // shuf
- "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
- "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
- "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
- "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
- "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
- "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
- "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
- "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
- "add $0x20, %4 \n" // add to src buffer
- // ptr
- "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
- "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
- "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
- "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
- "add $0x60,%2 \n" // add to dst buffer
- // ptr
- // "cmp %3, %4 \n" //(width64 -
- // 32 bytes) and src_offset
- "sub $0x20,%3 \n" // 32 pixels per loop
- "jg 1b \n"
- "vzeroupper \n" // sse-avx2
- // transistions
-
- : "+r"(src_y), //%0
- "+r"(src_vu), //%1
- "+r"(dst_yuv24), //%2
- "+r"(width64), //%3
- "+r"(src_offset) //%4
- : "m"(kBLEND0), //%5
- "m"(kBLEND1), //%6
- "m"(kBLEND2), //%7
- "m"(kSHUF0), //%8
- "m"(kSHUF1), //%9
- "m"(kSHUF2), //%10
- "m"(kSHUF3), //%11
- "m"(kSHUF4), //%12
- "m"(kSHUF5) //%13
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
- "xmm13", "xmm14", "xmm15");
-}
-#endif // HAS_NV21TOYUV24ROW_AVX2
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc
deleted file mode 100644
index d8726d09..00000000
--- a/files/source/row_mmi.cc
+++ /dev/null
@@ -1,6042 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "libyuv/row.h"
-
-#include <string.h> // For memcpy and memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask = 0xff000000ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [mask] "f"(mask)
- : "memory");
-}
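A scalar sketch of the conversion above: three source bytes are copied and an opaque alpha is OR'd in (the 0xff000000 mask). Illustrative only:

static void RGB24ToARGBRow_Sketch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[0] = src[0];  // B
    dst[1] = src[1];  // G
    dst[2] = src[2];  // R
    dst[3] = 255;     // opaque alpha
    src += 3;
    dst += 4;
  }
}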
-
-void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xff000000ULL;
- const uint64_t mask2 = 0xc6;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
- : "memory");
-}
-
-void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x6c;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
- "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
- "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[src1], %[src1], %[zero] \n\t"
- "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
- "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pextrh %[ftmp2], %[src1], %[zero] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
-
- "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
- "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
- : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
- : "memory");
-}
-
-void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[5];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[c1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
- : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
- [four] "f"(0x04)
- : "memory");
-}
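A scalar sketch of the 565 expansion above: each field is widened to 8 bits by replicating its top bits, (x << 3) | (x >> 2) for the 5-bit blue and red, (x << 2) | (x >> 4) for the 6-bit green, matching the psllh/psrlh/or sequences:

static void RGB565ToARGBRow_Sketch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    uint16_t p = (uint16_t)(src[0] | (src[1] << 8));
    uint8_t b = p & 0x1f;
    uint8_t g = (p >> 5) & 0x3f;
    uint8_t r = (p >> 11) & 0x1f;
    dst[0] = (uint8_t)((b << 3) | (b >> 2));
    dst[1] = (uint8_t)((g << 2) | (g >> 4));
    dst[2] = (uint8_t)((r << 3) | (r >> 2));
    dst[3] = 255;
    src += 2;
    dst += 4;
  }
}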
-
-void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- uint64_t c4 = 0x0001000100010001;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psrlh %[a], %[src1], %[seven] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "xor %[a], %[a], %[c1] \n\t"
- "paddb %[a], %[a], %[c4] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
-void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psrlh %[a], %[src1], %[four] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "psllh %[src0], %[a], %[four] \n\t"
- "or %[a], %[src0], %[a] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
- [four] "f"(0x04)
- : "memory");
-}
-
-void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
-
- "pextrh %[src0], %[ftmp1], %[two] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
- "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
-
- "pextrh %[src0], %[ftmp2], %[two] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[one] \n\t"
- "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[zero] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
- [eleven] "f"(0x0b)
- : "memory");
-}
-
-// dither4 is a row of 4 values from a 4x4 dither matrix.
-// The 4x4 matrix contains values to add to RGB. When converting to
-// fewer bits (565) this provides an ordered dither.
-// The first byte of the 4x4 matrix corresponds to the upper-left pixel.
-// The 4 values are passed as an int, then referenced as an array, so
-// endianness does not affect the order of the original matrix. But dither4
-// holds the first pixel in the lower byte on little-endian targets and in
-// the upper byte on big-endian targets. (A scalar sketch follows this
-// function.)
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t c0 = 0x00ff00ff00ff00ff;
-
- __asm__ volatile(
- "punpcklbh %[dither], %[dither], %[zero] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "paddh %[b], %[b], %[dither] \n\t"
- "paddh %[g], %[g], %[dither] \n\t"
- "paddh %[r], %[r], %[dither] \n\t"
- "pcmpgth %[src0], %[b], %[c0] \n\t"
- "or %[src0], %[src0], %[b] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[g], %[c0] \n\t"
- "or %[src0], %[src0], %[g] \n\t"
- "and %[g], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[r], %[c0] \n\t"
- "or %[src0], %[src0], %[r] \n\t"
- "and %[r], %[src0], %[c0] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
- [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
- : "memory");
-}
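To make the comment above concrete, here is a minimal scalar model of what each iteration does per pixel: add the dither byte, clamp to 255 (the pcmpgth/or/and sequence in the MMI code), then pack to 565. This is an editor's sketch with a hypothetical name, not code from the deleted file:

#include <stdint.h>

/* Hypothetical scalar model of ARGBToRGB565DitherRow_MMI.
 * dither4 holds one row of the 4x4 matrix, first pixel in the
 * low byte on little-endian targets. */
static void ARGBToRGB565DitherRowC(const uint8_t* argb, uint8_t* rgb565,
                                   uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    int d = (dither4 >> ((x & 3) * 8)) & 0xff;
    int b = argb[0] + d;
    int g = argb[1] + d;
    int r = argb[2] + d;
    if (b > 255) b = 255; /* saturate, as pcmpgth/or/and does above */
    if (g > 255) g = 255;
    if (r > 255) r = 255;
    uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    rgb565[0] = (uint8_t)p;
    rgb565[1] = (uint8_t)(p >> 8);
    argb += 4;
    rgb565 += 2;
  }
}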
-
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[three] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
- "psrlh %[a], %[a], %[seven] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[ten] \n\t"
- "psllh %[a], %[a], %[fifteen] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
- [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
- : "memory");
-}
-
-void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[four] \n\t"
- "psrlh %[g], %[g], %[four] \n\t"
- "psrlh %[r], %[r], %[four] \n\t"
- "psrlh %[a], %[a], %[four] \n\t"
-
- "psllh %[g], %[g], %[four] \n\t"
- "psllh %[r], %[r], %[eight] \n\t"
- "psllh %[a], %[a], %[twelve] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
- [twelve] "f"(0x0c)
- : "memory");
-}
-
-void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
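The constant mask 0x0001004200810019 packs the fixed-point BT.601 luma weights 25 (B), 129 (G) and 66 (R) next to a unit weight for the 0x1080 bias that pinsrh_3 writes into the alpha lane, so each pmaddhw/paddw/psrlw sequence evaluates (66R + 129G + 25B + 0x1080) >> 8 for one pixel, eight pixels per loop. A one-line scalar restatement (editor's sketch; the function name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar equivalent of ARGBToYRow_MMI: fixed-point BT.601
 * luma, with the +16 studio-swing offset folded into the 0x1080 bias. */
static inline uint8_t ARGBToYPixel(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}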
-
-void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
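This routine first sums each 2x2 block across the two rows and shifts right by 2 (a rounded-down per-channel average), then applies the packed weights: mask_u encodes (2, 112, 74, 38) and mask_v encodes (18, 94, 112, 2), with the inserted 0x4040 doubling to the 0x8080 bias under the weight 2 before the psubw/psraw steps. A scalar model of the per-block arithmetic (editor's sketch; the helper name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar model of one 2x2 block in ARGBToUVRow_MMI.
 * b, g, r are the truncated averages of the four source pixels. */
static inline void ARGBToUVBlock(int b, int g, int r,
                                 uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

The BGRA/ABGR/RGBA variants that follow differ only in channel order, which is why their mask constants are byte-reordered permutations of the same weights.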
-
-void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0019008100420001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0042008100190001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
- "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
- "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
- "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
- "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
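RGBAToUVRow_MMI above subsamples 2x2: paddh accumulates four pixels (two per row, across src_stride_rgb) and psrlh truncates by 2, then pmaddhw applies mask_u/mask_v with the 0x4040 bias slotted in by pinsrh, and the psubw pairs realize the negative coefficients. A scalar model of one U/V sample (helper names are illustrative, not from the source):

#include <stdint.h>

typedef struct { uint8_t b, g, r; } Pixel;

/* One U/V pair from a 2x2 block. 0x8080 = 2 * 0x4040 (the +128 chroma
   offset plus rounding), matching the pmaddhw coefficient of 2 on the
   inserted bias halfword. */
static void RGBToUV(Pixel p00, Pixel p01, Pixel p10, Pixel p11,
                    uint8_t* u, uint8_t* v) {
  int b = (p00.b + p01.b + p10.b + p11.b) >> 2;  /* truncating average */
  int g = (p00.g + p01.g + p10.g + p11.g) >> 2;
  int r = (p00.r + p01.r + p10.r + p11.r) >> 2;
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

The loop consumes 16 pixels (0x40 bytes) per iteration and emits 8 U and 8 V bytes, so width here counts source pixels.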
-void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
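RGB24ToYRow_MMI above differs from the 32-bit rows only in addressing: each gsld pair loads 8 bytes at a 6-byte stride, covering two 3-byte pixels, and the dsll by 8 re-aligns the second pixel so punpckhbh unpacks it cleanly (pinsrh_3 then overwrites the stray fourth halfword with the bias). A small model of that layout (function name illustrative):

#include <stdint.h>
#include <string.h>

/* Two packed RGB24 (B,G,R in memory) pixels per 8-byte load at p:
   bytes 0-2 are pixel 0 (reached via punpcklbh), bytes 3-5 are pixel 1
   (reached via dsll 8 + punpckhbh); bytes 6-7 are ignored. */
static void LoadTwoRGB24(const uint8_t* p, uint8_t px0[3], uint8_t px1[3]) {
  memcpy(px0, p, 3);
  memcpy(px1, p + 3, 3);
}

The source pointer therefore advances 0x18 = 24 bytes per eight output Y bytes.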
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
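Putting the 24-bit addressing and the chroma arithmetic together, the whole of RGB24ToUVRow_MMI above reduces to this scalar model (a sketch assuming width is a positive multiple of 16, as the bgtz loop requires; the function name is illustrative):

#include <stdint.h>

static void RGB24ToUVRow_model(const uint8_t* src, int stride,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = src + x * 3;  /* two pixels on this row... */
    const uint8_t* p1 = p0 + stride;  /* ...and two on the next */
    int b = (p0[0] + p0[3] + p1[0] + p1[3]) >> 2;
    int g = (p0[1] + p0[4] + p1[1] + p1[4]) >> 2;
    int r = (p0[2] + p0[5] + p1[2] + p1[5]) >> 2;
    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }
}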
-void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
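RAWToYRow_MMI above handles the byte-reversed 24-bit format (R,G,B in memory): its mask 0x0001001900810042 is RGB24ToYRow's 0x0001004200810019 with the 0x19 (25) and 0x42 (66) halfwords exchanged, so the same luma lands on the right channels. An illustrative check of that relationship (not source code):

#include <stdint.h>

/* Swapping the B and R coefficient halfwords converts the RGB24 luma
   mask into the RAW one:
   SwapBRCoeffs(0x0001004200810019) == 0x0001001900810042 */
static uint64_t SwapBRCoeffs(uint64_t mask) {
  uint64_t b = mask & 0xffff;
  uint64_t r = (mask >> 32) & 0xffff;
  return (mask & 0xffff0000ffff0000ULL) | r | (b << 32);
}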
-void RAWToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
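RAWToUVRow_MMI above likewise reverses the coefficient order inside mask_u/mask_v and swaps the psubw operands, so the signed result matches the RGB24 path. All of these 4x16-bit constants follow one packing rule, shown here as a helper (illustrative, not from the source):

#include <stdint.h>

/* Packs four 16-bit halfword coefficients, lowest lane first, e.g.
   MakeMask(38, 74, 112, 2) == 0x00020070004a0026 (mask_u above). */
static uint64_t MakeMask(uint16_t h0, uint16_t h1, uint16_t h2, uint16_t h3) {
  return (uint64_t)h0 | ((uint64_t)h1 << 16) |
         ((uint64_t)h2 << 32) | ((uint64_t)h3 << 48);
}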
-void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest0, dest1, dest2, dest3;
- uint64_t tmp0, tmp1;
- const uint64_t shift = 0x07;
- const uint64_t value = 0x0040;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00010026004B000FULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest2], %[dest2], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest3], %[dest3], %[shift] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1)
- : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
- [width] "r"(width)
- : "memory");
-}
-
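ARGBToYJRow_MMI above is the full-range (JPEG) luma: mask1 packs 15/75/38 for B/G/R, the pinsrh_3 slot carries the 0x40 rounding bias, and the shift is 7 rather than 8. Scalar sketch:

#include <stdint.h>

/* Full-range luma: no +16 offset, 7-bit coefficients, round to nearest. */
static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
}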
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x002b0054007f0002;
- const uint64_t mask_v = 0x0002007f006b0014;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
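ARGBToUVJRow_MMI above also goes full range, and it subsamples differently: pavgh averages with rounding (three averages per 2x2 block) instead of the truncating sum-and-shift used by the BT.601 rows. With r, g, b standing for those rounded averages, the chroma reduces to the following sketch (names illustrative):

#include <stdint.h>

static void RGBToUVJ(int r, int g, int b, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);
  *v = (uint8_t)((127 * r - 107 * g - 20 * b + 0x8080) >> 8);
}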
-void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
- : "memory");
-}
-
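RGB565ToYRow_MMI above first widens each channel to 8 bits by replicating its top bits, then reuses the standard 25/129/66 luma mask. The unpacking, extracted from the shift/or sequences above (names illustrative):

#include <stdint.h>

/* 565 -> 888: replicate each channel's high bits into the vacated low
   bits, (x << n) | (x >> (bits - n)), so 0 stays 0 and the field
   maximum maps to 255. */
static void RGB565ToRGB(uint16_t px, uint8_t* r, uint8_t* g, uint8_t* b) {
  uint8_t b5 = px & 0x1f, g6 = (px >> 5) & 0x3f, r5 = (uint8_t)(px >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}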
-void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
- [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
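ARGB1555ToYRow_MMI above is the 5-5-5 variant: c2 and c3 pick the green and red fields out of the high byte, the alpha bit (bit 15) is simply dropped, and every channel expands via (x << 3) | (x >> 2). Equivalent unpacking (names illustrative):

#include <stdint.h>

static void ARGB1555ToRGB(uint16_t px, uint8_t* r, uint8_t* g, uint8_t* b) {
  uint8_t b5 = px & 0x1f, g5 = (px >> 5) & 0x1f, r5 = (px >> 10) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g5 << 3) | (g5 >> 2));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}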
-void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x1080108010801080;
- uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
- : "memory");
-}
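// ARGB4444ToYRow_MMI follows the same pattern; the only difference is the
// 4 -> 8 bit widening, which the psllh/or pairs above implement as nibble
// replication ((x << 4) | x, i.e. x * 17). Illustrative scalar form:
static uint8_t ARGB4444PixelToY(uint16_t v) {
  uint8_t b = (uint8_t)((v & 0x0f) * 0x11);        // (x << 4) | x
  uint8_t g = (uint8_t)(((v >> 4) & 0x0f) * 0x11);
  uint8_t r = (uint8_t)(((v >> 8) & 0x0f) * 0x11);
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}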
-
-void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest0_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest1_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest2_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest2_v], %[src0], %[c2] \n\t"
- "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
- "or %[dest2_v], %[src1], %[dest2_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest3_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest3_v], %[src0], %[c2] \n\t"
- "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
- "or %[dest3_v], %[src1], %[dest3_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
- "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [one] "f"(0x01)
- : "memory");
-}
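// Scalar sketch of the RGB565 -> UV reduction above (illustrative helper, not
// a libyuv API): each U/V output comes from a 2x2 block averaged across the
// two input rows, after which the standard libyuv BT.601 formulas apply;
// mask_u/mask_v pack those coefficients for pmaddhw. The ARGB1555/ARGB4444
// UV kernels below share this structure and differ only in channel unpacking.
static void RGB565BlockToUV(const uint16_t* row0, const uint16_t* row1,
                            uint8_t* u, uint8_t* v) {
  const uint16_t px[4] = {row0[0], row0[1], row1[0], row1[1]};
  int b = 0, g = 0, r = 0;
  for (int i = 0; i < 4; ++i) {
    uint8_t b5 = px[i] & 0x1f, g6 = (px[i] >> 5) & 0x3f, r5 = px[i] >> 11;
    b += (b5 << 3) | (b5 >> 2);  // 5 -> 8 bits
    g += (g6 << 2) | (g6 >> 4);  // 6 -> 8 bits
    r += (r5 << 3) | (r5 >> 2);
  }
  b /= 4;
  g /= 4;
  r /= 4;  // 2x2 average
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}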
-
-void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest0_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest1_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest2_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest3_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
- "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
- "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555),
- [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [two] "f"(0x02), [one] "f"(0x01)
- : "memory");
-}
-
-void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest0_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest0_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest1_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest1_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest2_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest2_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest3_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest3_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
- "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_argb4444] "r"(src_argb4444),
- [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
- [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
- [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToUV444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
- [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
- [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
- [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
- [dest3_v] "=&f"(ftmp[11])
- : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
- [eight] "f"(0x08)
- : "memory");
-}
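// ARGBToUV444Row_MMI applies the same U/V formulas with no 2x2 subsampling:
// pinsrh_0/pinsrh_3 splice the 0x4040 bias into the lane that pmaddhw
// multiplies by the coefficient 2, so each result carries the 0x8080
// rounding bias directly. Per-pixel scalar equivalent:
//   u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
//   v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);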
-
-void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x01;
- const uint64_t mask2 = 0x00400026004B000FULL;
- const uint64_t mask3 = 0xFF000000FF000000ULL;
- const uint64_t mask4 = ~mask3;
- const uint64_t shift = 0x07;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "and %[src37], %[src], %[mask3] \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
- "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
- "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask4] \n\t"
- "or %[dest], %[dest], %[src37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
- [src37] "=&f"(src37)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
- : "memory");
-}
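// Scalar sketch of ARGBGrayRow_MMI (illustrative helper): pinsrh_3 replaces
// the alpha lane with 1 so the pmaddhw mask (15, 75, 38, 64) yields
// 15*B + 75*G + 38*R + 64 in one dot product, while mask3/mask4 carry the
// original alpha bytes (src37) through unchanged.
static void ARGBGrayPixel(const uint8_t* src, uint8_t* dst) {
  uint8_t y = (uint8_t)((15 * src[0] + 75 * src[1] + 38 * src[2] + 64) >> 7);
  dst[0] = dst[1] = dst[2] = y;  // gray written to B, G and R
  dst[3] = src[3];               // alpha preserved
}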
-
-// Convert a row of an image to sepia tone.
-void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
- uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x002300440011ULL;
- const uint64_t mask2 = 0x002D00580016ULL;
- const uint64_t mask3 = 0x003200620018ULL;
- const uint64_t mask4 = 0xFF000000FF000000ULL;
- const uint64_t shift = 0x07;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[dest37], %[dest], %[mask4] \n\t"
-
- "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "or %[dest], %[dest], %[dest37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
- [dest] "=&f"(dest)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [shift] "f"(shift)
- : "memory");
-}
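// Scalar sketch of the sepia transform, with the weights unpacked from
// mask1..mask3 (illustrative in-place helper; alpha is kept via mask4):
static void ARGBSepiaPixel(uint8_t* p) {  // p = B, G, R, A
  int b = p[0], g = p[1], r = p[2];
  int sb = (17 * b + 68 * g + 35 * r) >> 7;
  int sg = (22 * b + 88 * g + 45 * r) >> 7;
  int sr = (24 * b + 98 * g + 50 * r) >> 7;
  p[0] = (uint8_t)(sb > 255 ? 255 : sb);  // packushb saturates to 255
  p[1] = (uint8_t)(sg > 255 ? 255 : sg);
  p[2] = (uint8_t)(sr > 255 ? 255 : sr);
}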
-
-// Apply a color matrix to a row of an image. The matrix is signed.
-// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
- dest3;
- uint64_t matrix, matrix_hi, matrix_lo;
- uint64_t tmp0, tmp1;
- const uint64_t shift0 = 0x06;
- const uint64_t shift1 = 0x08;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest1], %[dest1], %[shift0] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest2], %[dest2], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest3], %[dest3], %[shift0] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
- [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
- : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
- : "memory");
-}
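// Scalar sketch of ARGBColorMatrixRow_MMI (illustrative helper): matrix_argb
// is a 4x4 signed 8-bit matrix; the psllh/psrah pairs above sign-extend its
// entries to 16 bits before the multiply-accumulate, and packushb clamps the
// >> 6 results to [0, 255].
static void ARGBColorMatrixPixel(const uint8_t* src, uint8_t* dst,
                                 const int8_t* m) {
  for (int j = 0; j < 4; ++j) {
    int v = (src[0] * m[j * 4 + 0] + src[1] * m[j * 4 + 1] +
             src[2] * m[j * 4 + 2] + src[3] * m[j * 4 + 3]) >> 6;
    dst[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}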
-
-void ARGBShadeRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "punpcklbh %[value], %[value], %[value] \n\t"
-
- "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
- [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [value] "f"(value), [shift] "f"(shift)
- : "memory");
-}
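// ARGBShadeRow_MMI scales each channel by the matching byte of |value|:
// punpcklbh of a register with itself widens each byte x to the halfword
// x * 0x0101, so pmulhuh followed by psrlh 8 yields approximately
// (x * scale) >> 8 per channel -- the same byte-replication trick the SSE2
// ARGBShadeRow uses with pmulhuw.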
-
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
- uint64_t dest, dest_lo, dest_hi;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
-
- "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
- "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
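// ARGBMultiplyRow_MMI byte-replicates src0 (x -> x * 0x0101) but only
// zero-extends src1, so pmulhuh computes (x * 0x0101 * y) >> 16, i.e.
// approximately (x * y) / 255 per channel. Scalar sketch per byte:
//   dst[i] = (uint8_t)((src0[i] * 0x0101 * src1[i]) >> 16);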
-
-void ARGBAddRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "paddusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "psubusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
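// ARGBAddRow_MMI and ARGBSubtractRow_MMI each reduce to one saturating byte
// instruction (paddusb / psubusb) over all eight bytes per iteration.
// Scalar equivalents per byte:
//   add:      dst[i] = (uint8_t)(s0[i] + s1[i] > 255 ? 255 : s0[i] + s1[i]);
//   subtract: dst[i] = (uint8_t)(s0[i] < s1[i] ? 0 : s0[i] - s1[i]);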
-
-// Sobel functions which mimic SSSE3.
-void SobelXRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- uint64_t y00 = 0, y10 = 0, y20 = 0;
- uint64_t y02 = 0, y12 = 0, y22 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
- "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
- "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t" // a+b
- "paddh %[y20], %[y20], %[y10] \n\t" // c+b
- "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
-
- "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
- "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
- "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
- "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
- "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t"
- "paddh %[y20], %[y20], %[y10] \n\t"
- "paddh %[y00], %[y00], %[y20] \n\t"
-
- "paddh %[y02], %[y02], %[y12] \n\t"
- "paddh %[y22], %[y22], %[y12] \n\t"
- "paddh %[y02], %[y02], %[y22] \n\t"
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[y00], %[y10], %[y20] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[src_y2], %[src_y2], 8 \n\t"
- "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
- [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
- [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
-
-void SobelYRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- uint64_t y00 = 0, y01 = 0, y02 = 0;
- uint64_t y10 = 0, y11 = 0, y12 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
- "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
- "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t" // a+b
- "paddh %[y02], %[y02], %[y01] \n\t" // c+b
- "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
-
- "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
- "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
- "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
- "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
- "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t"
- "paddh %[y02], %[y02], %[y01] \n\t"
- "paddh %[y00], %[y00], %[y02] \n\t"
-
- "paddh %[y10], %[y10], %[y11] \n\t"
- "paddh %[y12], %[y12], %[y11] \n\t"
- "paddh %[y10], %[y10], %[y12] \n\t"
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[y00], %[y02], %[y12] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
- [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
- [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
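-
-// A scalar sketch of the gradient both kernels above compute (the helper
-// name is illustrative, not part of libyuv). Each output is the clamped
-// absolute difference of two 1-2-1 weighted sums; SobelY applies the same
-// window along the row (offsets i, i+1, i+2) across two rows instead.
-static void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1,
-                             const uint8_t* y2, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    int a = y0[i] + 2 * y1[i] + y2[i];              // column i
-    int b = y0[i + 2] + 2 * y1[i + 2] + y2[i + 2];  // column i + 2
-    int s = a > b ? a - b : b - a;                  // pmaxsh/pminsh/psubh
-    dst[i] = s > 255 ? 255 : (uint8_t)s;            // packushb clamps
-  }
-}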
-
-void SobelRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- double temp[3];
- uint64_t c1 = 0xff000000ff000000;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
- "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
- // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
- "paddusb %[t2] , %[t0], %[t1] \n\t"
-
- // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
- "punpcklbh %[t0], %[t2], %[t2] \n\t"
-
- // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s1 s1 s1 255 s0 s0 s0
- "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
-
- // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s3 s3 s3 255 s2 s2 s2
- "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
-
- // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
- "punpckhbh %[t0], %[t2], %[t2] \n\t"
-
- // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
-
- // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
-
-void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- uint64_t tr = 0;
- uint64_t tb = 0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
- "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
- "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
- "paddusb %[tr], %[tr], %[tb] \n\t" // g
- "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
-
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(tr), [tb] "=&f"(tb)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_y] "r"(dst_y), [width] "r"(width)
- : "memory");
-}
-
-void SobelXYRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- uint64_t temp[3];
- uint64_t result = 0;
- uint64_t gb = 0;
- uint64_t cr = 0;
- uint64_t c1 = 0xffffffffffffffff;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
- "paddusb %[tg] , %[tr], %[tb] \n\t" // g
-
- // g3 b3 g2 b2 g1 b1 g0 b0
- "punpcklbh %[gb], %[tb], %[tg] \n\t"
- // c3 r3 c2 r2 c1 r1 c0 r0
- "punpcklbh %[cr], %[tr], %[c1] \n\t"
- // c1 r1 g1 b1 c0 r0 g0 b0
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
- // c3 r3 g3 b3 c2 r2 g2 b2
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
-
- // g7 b7 g6 b6 g5 b5 g4 b4
- "punpckhbh %[gb], %[tb], %[tg] \n\t"
- // c7 r7 c6 r6 c5 r5 c4 r4
- "punpckhbh %[cr], %[tr], %[c1] \n\t"
- // c5 r5 g5 b5 c4 r4 g4 b4
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
- // c7 r7 g7 b7 c6 r6 g6 b6
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
- [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
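-
-// A scalar sketch of the packing kernels above (helper name illustrative,
-// not part of libyuv): SobelRow stores the saturated sum as gray ARGB,
-// SobelToPlaneRow stores it as a plain plane, and SobelXYRow keeps X in R,
-// Y in B and the sum in G. ARGB bytes are B, G, R, A in memory.
-static void SobelXYRow_Sketch(const uint8_t* sx, const uint8_t* sy,
-                              uint8_t* dst_argb, int width) {
-  for (int i = 0; i < width; ++i) {
-    int g = sx[i] + sy[i];  // paddusb: unsigned saturating add
-    dst_argb[4 * i + 0] = sy[i];                       // B = Sobel Y
-    dst_argb[4 * i + 1] = g > 255 ? 255 : (uint8_t)g;  // G = X + Y
-    dst_argb[4 * i + 2] = sx[i];                       // R = Sobel X
-    dst_argb[4 * i + 3] = 255;                         // A from c1
-  }
-}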
-
-void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- // Replicate Y into B, G and R; the mask1 constant forces alpha to 255.
- uint64_t src, dest;
- const uint64_t mask0 = 0x00ffffff00ffffffULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src], %[src], %[src] \n\t"
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
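-
-// Scalar equivalent of the row above (helper name illustrative): each Y
-// byte is replicated into B, G and R, and mask1 supplies alpha = 255.
-static void J400ToARGBRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb,
-                                 int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_argb[4 * i + 0] = src_y[i];  // B
-    dst_argb[4 * i + 1] = src_y[i];  // G
-    dst_argb[4 * i + 2] = src_y[i];  // R
-    dst_argb[4 * i + 3] = 255;       // A
-  }
-}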
-
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
- uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x55;
- const uint64_t mask2 = 0xAA;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = 0x4A354A354A354A35ULL;
- const uint64_t mask5 = 0x0488048804880488ULL;
- const uint64_t shift0 = 0x08;
- const uint64_t shift1 = 0x06;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "pshufh %[src], %[src_lo], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_lo], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [width] "r"(width)
- : "memory");
-}
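-
-// The fixed-point sequence above (psllh + paddush = y * 257, pmulhuh by
-// 0x4A35, psubh 0x0488, psrah 6) expands limited-range Y to full range,
-// roughly 1.164 * (Y - 16). A per-pixel scalar reading (helper name
-// illustrative, not part of libyuv):
-static uint8_t I400Pixel_Sketch(uint8_t y) {
-  uint32_t v = (uint32_t)y * 0x0101;              // y * 257
-  v = (v * 0x4A35) >> 16;                         // pmulhuh keeps high half
-  int d = ((int)v - 0x0488) >> 6;                 // bias and scale
-  return d < 0 ? 0 : d > 255 ? 255 : (uint8_t)d;  // packushb clamps
-}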
-
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x1b;
-
- src += width - 1;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
- "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
- "punpcklbh %[src0], %[source], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask1] \n\t"
- "punpckhbh %[src1], %[source], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "packushb %[dest], %[src1], %[src0] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src0, src1, dest0, dest1;
- const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
- const uint64_t mask1 = 0x1b;
- const uint64_t shift = 0x08;
-
- src_uv += (width - 1) << 1;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
- "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
-
- "and %[dest0], %[src0], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "and %[dest1], %[src1], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
-
- "psrlh %[dest0], %[src0], %[shift] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "psrlh %[dest1], %[src1], %[shift] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
- "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
- "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
- [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- src += (width - 1) * 4;
- uint64_t temp = 0x0;
- uint64_t shuff = 0x4e; // 01 00 11 10
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[temp], 3(%[src]) \n\t"
- "gsldrc1 %[temp], -4(%[src]) \n\t"
- "pshufh %[temp], %[temp], %[shuff] \n\t"
- "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
- "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
-
- "daddiu %[src], %[src], -0x08 \n\t"
- "daddiu %[dst], %[dst], 0x08 \n\t"
- "daddiu %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [temp] "=&f"(temp)
- : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
- : "memory");
-}
-
-void SplitUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
-
- "and %[t2], %[t0], %[c0] \n\t"
- "and %[t3], %[t1], %[c0] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
-
- "psrlh %[t2], %[t0], %[shift] \n\t"
- "psrlh %[t3], %[t1], %[shift] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
-
- "daddiu %[src_uv], %[src_uv], 16 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [t3] "=&f"(temp[3])
- : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-void MergeUVRow_MMI(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- uint64_t temp[3];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
- "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
- "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
- "punpcklbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
- "punpckhbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
-
- "daddiu %[src_u], %[src_u], 8 \n\t"
- "daddiu %[src_v], %[src_v], 8 \n\t"
- "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [width] "r"(width)
- : "memory");
-}
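-
-// Scalar view of the split/merge pair above (helper names illustrative).
-// Interleaved UV is U0 V0 U1 V1 ...; the mask/shift pair selects even and
-// odd bytes, and punpcklbh/punpckhbh re-interleave them.
-static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
-                              uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_u[i] = src_uv[2 * i + 0];  // and with 0x00ff: even bytes
-    dst_v[i] = src_uv[2 * i + 1];  // psrlh by 8: odd bytes
-  }
-}
-static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
-                              uint8_t* dst_uv, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_uv[2 * i + 0] = src_u[i];
-    dst_uv[2 * i + 1] = src_v[i];
-  }
-}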
-
-void SplitRGBRow_MMI(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- uint64_t src[4];
- uint64_t dest_hi, dest_lo, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
- "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
-
- "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
- "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
- "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
- "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
- [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
- [dstb_ptr] "r"(dst_b), [width] "r"(width)
- : "memory");
-}
-
-void MergeRGBRow_MMI(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- uint64_t srcr, srcg, srcb, dest;
- uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
- const uint64_t temp = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
- "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
- "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
- "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
- "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
- "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
-
- "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
- "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
- "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
- "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
-
- "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
-
- "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
- "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
- "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
- [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
- [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
- [srcbz_lo] "=&f"(srcbz_lo)
- : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
- [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
- : "memory");
-}
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
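-
-// Scalar sketch of the 420 chroma filter above (helper name illustrative).
-// YUY2 stores Y0 U0 Y1 V0; pavgb averages the row with the row below using
-// round-half-up, then the U and V bytes are separated.
-static void YUY2ToUVRow_Sketch(const uint8_t* yuy2, int stride,
-                               uint8_t* dst_u, uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; i += 2) {
-    const uint8_t* p0 = yuy2 + 2 * i;  // 2 bytes per pixel
-    const uint8_t* p1 = p0 + stride;   // next row
-    dst_u[i / 2] = (uint8_t)((p0[1] + p1[1] + 1) >> 1);
-    dst_v[i / 2] = (uint8_t)((p0[3] + p1[3] + 1) >> 1);
-  }
-}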
-
-// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0)
- : "memory");
-}
-
-// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- // Output a row of Y values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t shift = 0x08;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "dsrl %[t0], %[t0], %[shift] \n\t"
- "dsrl %[t1], %[t1], %[shift] \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
-// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
- dest_lo;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
-
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[mask4] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
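-
-// Scalar form of the blend above (helper name illustrative). As in the
-// SSSE3 row it mimics, the divide is by 256 (psrlh 8), not 255, and the
-// destination alpha is forced to 255 via mask4.
-static void ARGBBlendPixel_Sketch(const uint8_t* fg, const uint8_t* bg,
-                                  uint8_t* dst) {
-  int na = 255 - fg[3];          // psubush + pshufh: broadcast 255 - alpha
-  for (int c = 0; c < 3; ++c) {  // B, G, R
-    int v = fg[c] + ((bg[c] * na) >> 8);  // pmullh, psrlh, paddush
-    dst[c] = v > 255 ? 255 : (uint8_t)v;
-  }
-  dst[3] = 255;
-}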
-
-void BlendPlaneRow_MMI(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- uint64_t source0, source1, dest, alph;
- uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
- dest_lo;
- uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
- "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
- "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
- "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
- "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
- "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
- "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
-
- "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
- "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
-
- "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
- "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
- [alpha_r] "=&f"(alpha_rev)
- : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
- [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
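-
-// Scalar equivalent of BlendPlaneRow_MMI (helper name illustrative): a
-// per-byte weighted average, with the 0x00FF bias (mask2) providing the
-// + 255 rounding before the shift.
-static void BlendPlaneRow_Sketch(const uint8_t* s0, const uint8_t* s1,
-                                 const uint8_t* a, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)((s0[i] * a[i] + s1[i] * (255 - a[i]) + 255) >> 8);
-  }
-}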
-
-// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
- const uint64_t mask0 = 0xFF;
- const uint64_t mask1 = 0xFF000000FF000000ULL;
- const uint64_t mask2 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
- "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
- "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask2] \n\t"
- "and %[src], %[src], %[mask1] \n\t"
- "or %[dest], %[dest], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
- [width] "r"(width)
- : "memory");
-}
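-
-// Scalar model of the attenuate row above (helper name illustrative). The
-// byte-doubling punpck plus pmulhuh approximates b * a / 255 per channel
-// (>> 16 from pmulhuh plus >> 8 from psrlh), and alpha passes through.
-static void ARGBAttenuatePixel_Sketch(const uint8_t* src, uint8_t* dst) {
-  uint32_t a = src[3];
-  for (int c = 0; c < 3; ++c) {  // B, G, R
-    dst[c] = (uint8_t)(((src[c] * 257u) * (a * 257u)) >> 24);
-  }
-  dst[3] = src[3];
-}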
-
-void ComputeCumulativeSumRow_MMI(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- int64_t row_sum[2] = {0, 0};
- uint64_t src, dest0, dest1, presrc0, presrc1, dest;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
- "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
-
- "punpcklbh %[src], %[src], %[mask] \n\t"
- "punpcklhw %[dest0], %[src], %[mask] \n\t"
- "punpckhhw %[dest1], %[src], %[mask] \n\t"
-
- "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
- "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
-
- "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
- "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
-
- "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
- "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
- "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
- [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
- [presrc1] "=&f"(presrc1)
- : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
- [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
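-
-// Scalar restatement of the row above (helper name illustrative): keep a
-// running per-channel sum across the row and add the previous row's
-// cumulative sums, producing one row of a summed-area table.
-static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
-                                           int32_t* cumsum,
-                                           const int32_t* prev, int width) {
-  int32_t sum[4] = {0, 0, 0, 0};
-  for (int x = 0; x < width; ++x) {
-    for (int c = 0; c < 4; ++c) {
-      sum[c] += row[4 * x + c];
-      cumsum[4 * x + c] = sum[c] + prev[4 * x + c];
-    }
-  }
-}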
-
-// Bilinear row filter: blend 2 rows into 1 (2x2 -> 2x1).
-void InterpolateRow_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int width,
- int source_y_fraction) {
- if (source_y_fraction == 0) {
- __asm__ volatile(
- "1: \n\t"
- "ld $t0, 0x0(%[src_ptr]) \n\t"
- "sd $t0, 0x0(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- :
- : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
- : "memory");
- return;
- }
- if (source_y_fraction == 128) {
- uint64_t uv = 0x0;
- uint64_t uv_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
- "daddu $t0, %[src_ptr], %[stride] \n\t"
- "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
- "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
-
- "pavgb %[uv], %[uv], %[uv_stride] \n\t"
- "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [stride] "r"((int64_t)src_stride)
- : "memory");
- return;
- }
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- uint64_t temp;
- uint64_t data[4];
- uint64_t zero = 0x0;
- uint64_t c0 = 0x0080008000800080;
- uint64_t fy0 = 0x0100010001000100;
- uint64_t shift = 0x8;
- __asm__ volatile(
- "pshufh %[fy1], %[fy1], %[zero] \n\t"
- "psubh %[fy0], %[fy0], %[fy1] \n\t"
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
- "punpcklbh %[d0], %[t0], %[zero] \n\t"
- "punpckhbh %[d1], %[t0], %[zero] \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
- "punpcklbh %[d2], %[t0], %[zero] \n\t"
- "punpckhbh %[d3], %[t0], %[zero] \n\t"
-
- "pmullh %[d0], %[d0], %[fy0] \n\t"
- "pmullh %[d2], %[d2], %[fy1] \n\t"
- "paddh %[d0], %[d0], %[d2] \n\t"
- "paddh %[d0], %[d0], %[c0] \n\t"
- "psrlh %[d0], %[d0], %[shift] \n\t"
-
- "pmullh %[d1], %[d1], %[fy0] \n\t"
- "pmullh %[d3], %[d3], %[fy1] \n\t"
- "paddh %[d1], %[d1], %[d3] \n\t"
- "paddh %[d1], %[d1], %[c0] \n\t"
- "psrlh %[d1], %[d1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d1] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
- [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
- [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
- [shift] "f"(shift), [zero] "f"(zero)
- : "memory");
-}
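-
-// The two early returns above handle f == 0 (plain copy) and f == 128
-// (pavgb average); the general path is, in scalar form (helper name
-// illustrative), a rounded weighted blend of the two rows:
-static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* s0,
-                                  const uint8_t* s1, int width, int f) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)((s0[i] * (256 - f) + s1[i] * f + 128) >> 8);
-  }
-}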
-
-// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
- ((shuffler[2] & 0x03) << 4) |
- ((shuffler[3] & 0x03) << 6);
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[src], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "punpckhbh %[dest1], %[src], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest], %[dest0], %[dest1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
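-
-// Scalar equivalent of the shuffle above (helper name illustrative). Only
-// the low two bits of the first four shuffler entries survive into mask1,
-// so one 4-byte permutation is applied to every pixel.
-static void ARGBShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
-                                  const uint8_t* shuffler, int width) {
-  for (int i = 0; i < width; ++i) {
-    for (int c = 0; c < 4; ++c) {
-      dst[4 * i + c] = src[4 * i + (shuffler[c] & 3)];
-    }
-  }
-}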
-
-void I422ToYUY2Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
-
-void I422ToUYVYRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
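-
-// The two rows above differ only in byte order. A scalar sketch of the
-// YUY2 packing (helper name illustrative); UYVY emits U0 Y0 V0 Y1 instead.
-static void I422ToYUY2Row_Sketch(const uint8_t* y, const uint8_t* u,
-                                 const uint8_t* v, uint8_t* dst, int width) {
-  for (int i = 0; i < width; i += 2) {
-    dst[2 * i + 0] = y[i];      // Y0
-    dst[2 * i + 1] = u[i / 2];  // U0
-    dst[2 * i + 2] = y[i + 1];  // Y1
-    dst[2 * i + 3] = v[i / 2];  // V0
-  }
-}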
-
-void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest;
- const uint64_t mask0 = 0xff000000ff000000ULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[src], %[src], %[mask0] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[src], %[dest] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
- const uint64_t mask = 0xff000000ff000000ULL;
- const uint64_t shift = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
-
-void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00ffffff00ffffffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "punpckhbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
deleted file mode 100644
index a12fa790..00000000
--- a/files/source/row_neon.cc
+++ /dev/null
@@ -1,2892 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <stdio.h>
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.32 {d2[0]}, [%1]! \n" \
- "vld1.32 {d2[1]}, [%2]! \n"
-
-// Read 8 Y, 8 U and 8 V from 444. Adjacent U and V pairs are averaged
-// (vpaddl + vrshrn) down to the 4 U and 4 V the 8-pixel kernel consumes.
-#define READYUV444 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 UYVY
-#define READUYVY \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUVTORGB_SETUP \
- "vld1.8 {d24}, [%[kUVToRB]] \n" \
- "vld1.8 {d25}, [%[kUVToG]] \n" \
- "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
- "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
-
-#define YUVTORGB \
- "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
- "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
- "vmovl.u8 q0, d0 \n" /* Y */ \
- "vmovl.s16 q10, d1 \n" \
- "vmovl.s16 q0, d0 \n" \
- "vmul.s32 q10, q10, q15 \n" \
- "vmul.s32 q0, q0, q15 \n" \
- "vqshrun.s32 d0, q0, #16 \n" \
- "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
- "vadd.s16 d18, d19 \n" \
- "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
- "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
- "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
- "vaddw.u16 q1, q1, d16 \n" \
- "vaddw.u16 q10, q10, d17 \n" \
- "vaddw.u16 q3, q3, d18 \n" \
- "vqadd.s16 q8, q0, q13 \n" /* B */ \
- "vqadd.s16 q9, q0, q14 \n" /* R */ \
- "vqadd.s16 q0, q0, q4 \n" /* G */ \
- "vqadd.s16 q8, q8, q1 \n" /* B */ \
- "vqadd.s16 q9, q9, q10 \n" /* R */ \
- "vqsub.s16 q0, q0, q3 \n" /* G */ \
- "vqshrun.s16 d20, q8, #6 \n" /* B */ \
- "vqshrun.s16 d22, q9, #6 \n" /* R */ \
- "vqshrun.s16 d21, q0, #6 \n" /* G */
-
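For reference, YUVTORGB above is a fixed-point vectorization of the per-pixel conversion sketched below in C. The constants are illustrative BT.601 values (assumed here; the shipped values are loaded from struct YuvConstants by YUVTORGB_SETUP), and the final >>6 with clamping mirrors the vqshrun.s16 #6 narrowing. A sketch, not the exact kernel:

#include <stdint.h>

// Illustrative BT.601 fixed-point constants (assumed values; the real ones
// come from struct YuvConstants).
enum { YG = 18997, YGB = -1160, UB = 128, UG = 25, VG = 52, VR = 102 };

static uint8_t Clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel: Y gain in 16.16 fixed point (the vmul.s32/vqshrun #16 pair),
// U/V terms scaled by 64, then >>6 with saturation as in vqshrun.s16 #6.
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int32_t y1 = (int32_t)(((uint32_t)(y * 0x0101) * YG) >> 16);
  *b = Clamp255((y1 + u * UB + (YGB - 128 * UB)) >> 6);
  *g = Clamp255((y1 - (u * UG + v * VG) + (YGB + 128 * (UG + VG))) >> 6);
  *r = Clamp255((y1 + v * VR + (YGB - 128 * VR)) >> 6);
}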
-void I444ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- const uint8_t* src_a,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToRGBARow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTORGB565 \
- "vshll.u8 q0, d22, #8 \n" /* R */ \
- "vshll.u8 q8, d21, #8 \n" /* G */ \
- "vshll.u8 q9, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #5 \n" /* RG */ \
- "vsri.16 q0, q9, #11 \n" /* RGB */
-
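ARGBTORGB565 shifts each channel to the top of a 16-bit lane with vshll.u8 and stitches the fields together with vsri.16, which keeps only the top 5-6-5 bits of each channel. A per-pixel C sketch of the same packing:

// Pack 8-bit R,G,B into RGB565 (r:5 g:6 b:5), keeping each channel's top
// bits, exactly what the vshll/vsri sequence computes per 16-bit lane.
static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}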
-void I422ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTOARGB1555 \
- "vshll.u8 q0, d23, #8 \n" /* A */ \
- "vshll.u8 q8, d22, #8 \n" /* R */ \
- "vshll.u8 q9, d21, #8 \n" /* G */ \
- "vshll.u8 q10, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #1 \n" /* AR */ \
- "vsri.16 q0, q9, #6 \n" /* ARG */ \
- "vsri.16 q0, q10, #11 \n" /* ARGB */
-
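ARGBTOARGB1555 is the same vshll/vsri trick with a leading 1-bit alpha field; per pixel it reduces to this sketch:

// Pack to ARGB1555 (a:1 r:5 g:5 b:5) from 8-bit channels, keeping top bits.
static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) |
                    ((g >> 3) << 5) | (b >> 3));
}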
-void I422ToARGB1555Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
-
-void I422ToARGB4444Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
- "1: \n"
-
- READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d23, #255 \n"
- "1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d20", "d21", "d22", "d23");
-}
-
-void NV12ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void NV21ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void NV12ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
-
- YUVTORGB_SETUP
-
- "1: \n"
-
- READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void NV21ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
-
- YUVTORGB_SETUP
-
- "1: \n"
-
- READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void NV12ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
-void SplitUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
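The vld2.8 load performs the de-interleave itself; as a scalar reference (a sketch matching libyuv's C fallback):

// Split interleaved UV bytes into separate U and V planes.
static void SplitUVRow_C_Sketch(const uint8_t* src_uv,
                                uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}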
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g, dst_b.
-void SplitRGBRow_NEON(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "d0", "d1", "d2" // Clobber List
- );
-}
-
-// Reads 16 planar R's, G's and B's and writes out 16 packed RGB pixels at a time.
-void MergeRGBRow_NEON(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-// Copies multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on Cortex-A15.
-void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// SetRow writes 'width' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
- asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v8) // %2
- : "cc", "memory", "q0");
-}
-
-// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
-void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
- asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v32) // %2
- : "cc", "memory", "q0");
-}
-
-void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
-}
-
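MirrorRow walks the source backwards 16 bytes at a time and reverses each block with vrev64.8; the two d-register stores are swapped because vrev64 only reverses within 64-bit lanes. A scalar sketch:

// Reverse a row of bytes.
static void MirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}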
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- // Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
-}
-
-void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- asm volatile(
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
- // RGB24.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3" // Clobber List
- );
-}
-
-#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
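RGB565TOARGB widens each 5- or 6-bit field back to 8 bits and replicates the high bits into the low bits, so a full-scale input maps to 255 rather than 248 or 252. Per pixel:

// Expand RGB565 to 8-bit B,G,R with bit replication, as the vshl/vshr/vorr
// sequence does (e.g. 5-bit b -> (b << 3) | (b >> 2)).
static void UnpackRGB565(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = p & 0x1f, g6 = (p >> 5) & 0x3f, r5 = (p >> 11) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}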
-void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-
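ARGB4444TOARGB expands each 4-bit field by shifting it into the top nibble and OR-ing the original back into the bottom, i.e. n -> n * 17. Per channel:

// Expand a 4-bit channel value to 8 bits: (n << 4) | n.
static uint8_t Expand4(uint8_t n) {
  return (uint8_t)((n << 4) | (n & 0x0f));
}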
-void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
- // RGB24.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
- "d7" // Clobber List
- );
-}
-
-void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
- "d7" // Clobber List
- );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void I422ToYUY2Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3");
-}
-
-void I422ToUYVYRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3");
-}
-
-void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb565,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "vdup.32 d2, %2 \n" // dither4
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
- ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb1555,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb4444,
- int width) {
- asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
- // vbic.
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
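ARGBToYRow's multiply-accumulate computes a BT.601 studio-range luma: the byte coefficients sum to 111, vqrshrun.s16 #7 is a rounding divide by 128, and vqadd.u8 adds the +16 bias with saturation. Scalar form (a sketch derived from the NEON path):

// Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16, saturated to 255.
static uint8_t RGBToYSketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;
  return (uint8_t)(y > 255 ? 255 : y);
}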
-void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
-// 8x1 pixels: full-resolution U and V, no 2x2 subsampling.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
- // coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
-
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
-
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
- "q15");
-}
-
-// clang-format off
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
-// clang-format on
-
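RGBTOUV consumes 16-bit channel values that are twice the 2x2 average (vrshr #1 halves the four-pixel sums only once), which is why the coefficients are written pre-halved as #112 / 2 and so on. Folding the doubling back in, each output byte reduces to the sketch below, which matches libyuv's C path:

// U/V from 2x2-averaged 8-bit B,G,R; 0x8080 supplies the +128 bias plus
// 0.5 of rounding before the >>8.
static uint8_t RGBToUSketch(uint8_t b, uint8_t g, uint8_t r) {
  int u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
  return (uint8_t)(u < 0 ? 0 : (u > 255 ? 255 : u));
}
static uint8_t RGBToVSketch(uint8_t b, uint8_t g, uint8_t r) {
  int v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}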
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RAWToUVRow_NEON(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction) // %4
- :
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
-}
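
For reference, a scalar C sketch of the general-purpose blend above (a hypothetical helper, not part of the library): each output byte is (src0 * (256 - f) + src1 * f + 128) >> 8, and the NEON code special-cases f == 0 (plain copy) and f == 128 (vrhadd 50/50 average).

#include <stdint.h>

// Scalar equivalent of the general row-blend loop (assumes 0 <= f <= 255).
static void InterpolateRow_C_Sketch(uint8_t* dst, const uint8_t* src0,
                                    const uint8_t* src1, int width, int f) {
  int f0 = 256 - f;  // weight of the top row
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src0[i] * f0 + src1[i] * f + 128) >> 8);  // vrshrn #8
  }
}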
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
-
- "89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
-
-      // Blend 1 pixel.
- "1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
-}
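
The blend follows the identity in the comment above ARGBBlendRow_NEON. A scalar sketch (hypothetical helper; the NEON version additionally rounds the product and uses saturating subtract/add):

#include <stdint.h>

// Per-pixel "over" blend: out = s + d - d * sa / 256, alpha forced to 255.
static void ARGBBlendPixel_C_Sketch(const uint8_t s[4], const uint8_t d[4],
                                    uint8_t out[4]) {
  uint32_t sa = s[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R
    uint32_t v = s[c] + d[c] - ((d[c] * sa) >> 8);
    out[c] = v > 255 ? 255 : (uint8_t)v;
  }
  out[3] = 255;  // a = 255
}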
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // Attenuate 8 pixels.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
-}
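
Attenuation multiplies each color channel by its own alpha. A scalar sketch of the loop above (hypothetical helper, not part of the library):

#include <stdint.h>

// out = c * a / 256 with rounding, matching vmull.u8 + vqrshrn.u16 #8.
static void ARGBAttenuatePixel_C_Sketch(uint8_t bgra[4]) {
  uint32_t a = bgra[3];
  for (int c = 0; c < 3; ++c) {
    bgra[c] = (uint8_t)((bgra[c] * a + 128) >> 8);
  }
}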
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
-
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
-}
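
The quantize kernel halves 'scale' up front because vqdmulh doubles its product: (2 * v * (scale >> 1)) >> 16 equals (v * scale) >> 16 for even scale. A scalar sketch of the per-channel math (hypothetical helper, not part of the library):

#include <stdint.h>

// dst = (dst * scale >> 16) * interval_size + interval_offset, saturated.
static uint8_t Quantize_C_Sketch(uint8_t v, int scale, int interval_size,
                                 int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return q > 255 ? 255 : (uint8_t)q;
}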
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in d0 to d7.
-// Rounding in vqrdmulh adds +1 to the high half if the high bit of the low
-// s16 is set.
-void ARGBShadeRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
-}
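
The shade value is duplicated so each 16-bit lane holds one channel byte replicated (s * 257), pre-halved, then vqrdmulh computes (2 * v * lane + 0x8000) >> 16, which is approximately v * s / 255. A scalar sketch (hypothetical helper, not part of the library):

#include <stdint.h>

// Shade one channel value v by the 8-bit shade component s.
static uint8_t Shade_C_Sketch(uint8_t v, uint8_t s) {
  uint32_t lane = ((uint32_t)s * 257) >> 1;     // vdup.u32 + vzip.u8 + vshr #1
  uint32_t r = (2u * v * lane + 0x8000) >> 16;  // vqrdmulh.s16
  return r > 255 ? 255 : (uint8_t)r;            // vqmovn.u16
}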
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
-      "vmov.u8    d28, #24                       \n"  // RB coefficient
-      "vmov.u8    d29, #98                       \n"  // RG coefficient
-      "vmov.u8    d30, #50                       \n"  // RR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
- "q14", "q15");
-}
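
A scalar form of the sepia transform above, directly from the three formulas in the comment (hypothetical helper; note the sums can exceed 255, hence the saturating vqshrn in the NEON code):

#include <stdint.h>

static void SepiaPixel_C_Sketch(uint8_t* bgra) {
  int b = bgra[0], g = bgra[1], r = bgra[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  bgra[0] = nb > 255 ? 255 : (uint8_t)nb;
  bgra[1] = ng > 255 ? 255 : (uint8_t)ng;
  bgra[2] = nr > 255 ? 255 : (uint8_t)nr;  // alpha (bgra[3]) is unchanged
}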
-
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Same as Sepia except the matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
-      "vld1.8     {q2}, [%3]                     \n"  // load color matrix (16 s8).
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
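
A scalar sketch of the color-matrix multiply above: each output channel is a saturating dot product of the input BGRA vector with one row of four signed coefficients, scaled down by 6 bits (hypothetical helper, not part of the library):

#include <stdint.h>

static void ColorMatrixPixel_C_Sketch(const uint8_t in[4], uint8_t out[4],
                                      const int8_t m[16]) {
  for (int o = 0; o < 4; ++o) {  // B, G, R, A outputs
    int acc = in[0] * m[4 * o + 0] + in[1] * m[4 * o + 1] +
              in[2] * m[4 * o + 2] + in[3] * m[4 * o + 3];
    acc >>= 6;  // vqshrun.s16 #6
    out[o] = acc < 0 ? 0 : (acc > 255 ? 255 : (uint8_t)acc);
  }
}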
-
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- // 16 pixel loop.
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2), // %5
- "r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
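
The two strided loads above fetch columns x and x + 2 of each row, so each output is the absolute value of a 3-row weighted column difference. A scalar sketch (hypothetical helper, not part of the library):

#include <stdint.h>

// |(t[x]-t[x+2]) + 2*(c[x]-c[x+2]) + (b[x]-b[x+2])|, saturated to 8 bits.
static void SobelXRow_C_Sketch(const uint8_t* t, const uint8_t* c,
                               const uint8_t* b, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    int s = (t[x] - t[x + 2]) + 2 * (c[x] - c[x + 2]) + (b[x] - b[x + 2]);
    if (s < 0) s = -s;
    dst[x] = s > 255 ? 255 : (uint8_t)s;
  }
}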
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1), // %4
- "r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// %y passes a float as a scalar vector for vector * scalar multiply.
-// The register must be d0 to d15 and indexed with [0] or [1] to access
-// the first or second float of the d-register.
-
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
-      "vmovl.u16  q2, d2                         \n"  // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
-
-void HalfFloatRow_NEON(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
-      "vmovl.u16  q2, d2                         \n"  // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
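
The magic multiplier 1.9259299444e-34f is 2^-112. Multiplying a float by it moves the exponent from float's bias of 127 to half-float's bias of 15 (127 - 15 = 112); after that, bits [28:13] of the float encoding are the IEEE half-float encoding, which vqshrn.u32 #13 extracts with saturation. A scalar sketch (hypothetical helper, not part of the library; saturation of out-of-range values is omitted):

#include <stdint.h>

static uint16_t UInt16ToHalf_C_Sketch(uint16_t v, float scale) {
  union { float f; uint32_t u; } bits;
  bits.f = (float)v * scale * 1.9259299444e-34f;  // scale, rebias exponent
  return (uint16_t)(bits.u >> 13);                // exponent + top of mantissa
}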
-
-void ByteToFloatRow_NEON(const uint8_t* src,
- float* dst,
- float scale,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussCol_NEON(const uint16_t* src0,
- const uint16_t* src1,
- const uint16_t* src2,
- const uint16_t* src3,
- const uint16_t* src4,
- uint32_t* dst,
- int width) {
- asm volatile(
- "vmov.u16 d6, #4 \n" // constant 4
- "vmov.u16 d7, #6 \n" // constant 6
-
- "1: \n"
- "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
- "vld1.16 {q2}, [%4]! \n"
- "vaddl.u16 q0, d2, d4 \n" // * 1
- "vaddl.u16 q1, d3, d5 \n" // * 1
- "vld1.16 {q2}, [%1]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "vld1.16 {q2}, [%2]! \n"
- "vmlal.u16 q0, d4, d7 \n" // * 6
- "vmlal.u16 q1, d5, d7 \n" // * 6
- "vld1.16 {q2}, [%3]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "subs %6, %6, #8 \n" // 8 processed per loop
- "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
- "bgt 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(src4), // %4
- "+r"(dst), // %5
- "+r"(width) // %6
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
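
A scalar form of the vertical pass above: a 1-4-6-4-1 weighted sum of five source rows, widened to 32 bits. There is no shift here; the matching >> 8 happens in GaussRow_NEON after the horizontal pass, since the combined kernel weight is 16 * 16 = 256. (Hypothetical helper, not part of the library.)

#include <stdint.h>

static void GaussCol_C_Sketch(const uint16_t* s0, const uint16_t* s1,
                              const uint16_t* s2, const uint16_t* s3,
                              const uint16_t* s4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = s0[i] + 4 * s1[i] + 6 * s2[i] + 4 * s3[i] + s4[i];
  }
}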
-
-// Filter 5 horizontal samples with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
- const uint32_t* src1 = src + 1;
- const uint32_t* src2 = src + 2;
- const uint32_t* src3 = src + 3;
- asm volatile(
- "vmov.u32 q10, #4 \n" // constant 4
- "vmov.u32 q11, #6 \n" // constant 6
-
- "1: \n"
- "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
- "vld1.32 {q2}, [%0] \n"
- "vadd.u32 q0, q0, q1 \n" // * 1
- "vadd.u32 q1, q1, q2 \n" // * 1
- "vld1.32 {q2, q3}, [%2]! \n"
- "vmla.u32 q0, q2, q11 \n" // * 6
- "vmla.u32 q1, q3, q11 \n" // * 6
- "vld1.32 {q2, q3}, [%1]! \n"
- "vld1.32 {q8, q9}, [%3]! \n"
- "vadd.u32 q2, q2, q8 \n" // add rows for * 4
- "vadd.u32 q3, q3, q9 \n"
- "vmla.u32 q0, q2, q10 \n" // * 4
- "vmla.u32 q1, q3, q10 \n" // * 4
- "subs %5, %5, #8 \n" // 8 processed per loop
- "vqshrn.u32 d0, q0, #8 \n" // round and pack
- "vqshrn.u32 d1, q1, #8 \n"
- "vst1.u16 {q0}, [%4]! \n" // store 8 samples
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(dst), // %4
- "+r"(width) // %5
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
-}
-
-// Convert biplanar NV21 to packed YUV24
-void NV21ToYUV24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q2}, [%0]! \n" // load 16 Y values
- "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
- "vmov d1, d0 \n"
- "vzip.u8 d0, d1 \n" // VV
- "vmov d3, d2 \n"
- "vzip.u8 d2, d3 \n" // UU
- "subs %3, %3, #16 \n" // 16 pixels per loop
- "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
- "vst3.8 {d1, d3, d5}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_yuv24), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2");
-}
-
-void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
- // pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
- // pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
- // pixels.
-      "vpadal.u8  q0, q4                         \n"  // V 16 bytes -> 8 shorts.
-      "vpadal.u8  q1, q5                         \n"  // U 16 bytes -> 8 shorts.
- "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
- "vqrshrun.s16 d0, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_stride_ayuv), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
-}
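
AYUV is stored V, U, Y, A per pixel here (d0 = V, d2 = U, d4 = Y in the vld4 above). A scalar sketch of the 2x2 chroma average (hypothetical helper, not part of the library):

#include <stdint.h>

// Average U and V over a 2x2 block with rounding; store interleaved UV.
static void AYUVToUV_C_Sketch(const uint8_t* row0, const uint8_t* row1,
                              uint8_t* dst_uv, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    const uint8_t* p0 = row0 + i * 8;  // two 4-byte AYUV pixels
    const uint8_t* p1 = row1 + i * 8;
    dst_uv[2 * i + 0] = (uint8_t)((p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2);  // U
    dst_uv[2 * i + 1] = (uint8_t)((p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2);  // V
  }
}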
-
-void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_vu,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
- // pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
- // pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
- // pixels.
-      "vpadal.u8  q0, q4                         \n"  // V 16 bytes -> 8 shorts.
-      "vpadal.u8  q1, q5                         \n"  // U 16 bytes -> 8 shorts.
- "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
- "vqrshrun.s16 d1, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_stride_ayuv), // %1
- "+r"(dst_vu), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
-}
-
-// Copy row of AYUV Y's into Y.
-// Similar to ARGBExtractAlphaRow_NEON
-void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
- "vld2.8 {d1, d3}, [%0]! \n"
- "vorr.u8 q2, q0, q0 \n" // move U after V
- "subs %2, %2, #16 \n" // 16 pixels per loop
- "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2");
-}
-
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
deleted file mode 100644
index f5cbb470..00000000
--- a/files/source/row_neon64.cc
+++ /dev/null
@@ -1,3036 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.s}[0], [%1], #4 \n" \
- "ld1 {v1.s}[1], [%2], #4 \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.d}[0], [%1], #8 \n" \
- "ld1 {v1.d}[1], [%2], #8 \n" \
- "uaddlp v1.8h, v1.16b \n" \
- "rshrn v1.8b, v1.8h, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "movi v1.8b , #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v3.8b, v2.8b, v2.8b \n" \
- "uzp2 v1.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
- "uzp2 v3.8b, v1.8b, v1.8b \n" \
- "uzp1 v1.8b, v1.8b, v1.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 UYVY
-#define READUYVY \
- "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
- "orr v0.8b, v3.8b, v3.8b \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
-
-#define YUVTORGB(vR, vG, vB) \
- "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
- "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
- "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
- "ushll v0.4s, v0.4h, #0 \n" \
- "mul v3.4s, v3.4s, v31.4s \n" \
- "mul v0.4s, v0.4s, v31.4s \n" \
- "sqshrun v0.4h, v0.4s, #16 \n" \
- "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
- "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
- "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
- "uxtl v2.8h, v2.8b \n" \
- "uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
- "sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
- "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
-
-void I444ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV444
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- const uint8_t* src_a,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToRGBARow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-#define ARGBTORGB565 \
- "shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v21.8h, #5 \n" /* RG */ \
- "sri v0.8h, v20.8h, #11 \n" /* RGB */
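
A scalar equivalent of the ARGBTORGB565 packing above: the sri (shift right and insert) chain keeps the top 5/6/5 bits of R, G and B in one 16-bit value. (A sketch, not part of the library.)

#include <stdint.h>

static uint16_t PackRGB565_C_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}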
-
-void I422ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
-      "1:                                        \n"
-      READYUV422
-      YUVTORGB(v22, v21, v20)
-      "subs       %w4, %w4, #8                   \n"
-      ARGBTORGB565
-      "st1        {v0.8h}, [%3], #16             \n"  // store 8 RGB565 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-#define ARGBTOARGB1555 \
- "shll v0.8h, v23.8b, #8 \n" /* A */ \
- "shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v22.8h, #1 \n" /* AR */ \
- "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
- "sri v0.8h, v20.8h, #11 \n" /* ARGB */
-
-void I422ToARGB1555Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
-      "1:                                        \n"
-      READYUV422
-      YUVTORGB(v22, v21, v20)
-      "subs       %w4, %w4, #8                   \n"
-      ARGBTOARGB1555
-      "st1        {v0.8h}, [%3], #16             \n"  // store 8 ARGB1555 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-#define ARGBTOARGB4444 \
- /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
- "ushr v20.8b, v20.8b, #4 \n" /* B */ \
- "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
- "ushr v22.8b, v22.8b, #4 \n" /* R */ \
- "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
- "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
- "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
- "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
-
-void I422ToARGB4444Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
- ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUV400
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v23.8b, #255 \n"
- "1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v20", "v21", "v22", "v23");
-}
-
-void NV12ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV21ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV12ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV21ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV12ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
-      "1:                                        \n"
-      READNV12
-      YUVTORGB(v22, v21, v20)
-      "subs       %w3, %w3, #8                   \n"
-      ARGBTORGB565
-      "st1        {v0.8h}, [%2], #16             \n"  // store 8 RGB565 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUY2
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READUYVY
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
-void SplitRGBRow_NEON(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
-void MergeRGBRow_NEON(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// Copy multiple of 32.
-void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// SetRow writes 'width' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
- asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v8) // %2
- : "cc", "memory", "v0");
-}
-
-void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
- asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v32) // %2
- : "cc", "memory", "v0");
-}
-
-void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
-}
-
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
-}
-
-void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v4.8b, #255 \n" // Alpha
- "1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v5.8b, #255 \n" // Alpha
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-
-void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- asm volatile(
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-#define RGB565TOARGB \
- "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
- "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
- "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
- "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
- "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
- "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
- "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
- "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
- "dup v2.2D, v0.D[1] \n" /* R */
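
A scalar equivalent of the RGB565TOARGB unpacking above: each field widens to 8 bits by replicating its top bits into the low bits (BBBBB000 | 00000BBB), which maps 0..31 (or 0..63 for G) onto the full 0..255 range. (A sketch, not part of the library.)

#include <stdint.h>

static void UnpackRGB565_C_Sketch(uint16_t p, uint8_t* b, uint8_t* g,
                                  uint8_t* r) {
  uint8_t b5 = (uint8_t)(p & 0x1f), g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(p >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}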
-
-void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
- );
-}
-
-#define ARGB1555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
- \
- "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
- "xtn2 v3.16b, v2.8h \n" \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
- "dup v1.2D, v0.D[1] \n" \
- "dup v3.2D, v2.D[1] \n"
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
- "dup v1.2D, v0.D[1] \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-#define ARGB4444TOARGB \
- "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
- "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
- "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
- "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
- "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
- "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
- "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
- "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
- "dup v0.2D, v2.D[1] \n" \
- "dup v1.2D, v3.D[1] \n"
-
-void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(src_yuy2b), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7" // Clobber List
- );
-}
-
-void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(src_uyvyb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7" // Clobber List
- );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-void I422ToYUY2Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void I422ToUYVYRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb565,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb1555,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb4444,
- int width) {
- asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
- // vbic.
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
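-
-// Scalar sketch (not part of the original source) of the luma math in
-// ARGBToYRow_NEON above: BT.601 studio-range Y with rounding, then +16.
-// 13 + 65 + 33 = 111 < 128, so the sum cannot overflow after the >> 7.
-static __inline uint8_t ARGBToY_C_sketch(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint8_t)(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16);
-}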
-
-void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
-
-// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
- // coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
- "v27", "v28", "v29");
-}
-
-#define RGBTOUV_SETUP_REG \
- "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
- "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
- "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
- "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
- "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
- "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-// clang-format off
-#define RGBTOUV(QB, QG, QR) \
- "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
- "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
- "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
- "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
- "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
- "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
- "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
- "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
- "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
- "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
-// clang-format on
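-
-// Scalar sketch (not part of the original source) of the RGBTOUV macro.
-// The inputs are 2x-scaled 2x2 block averages (sum of 4 samples >> 1),
-// which is why the coefficients are half of the 112/74/38/18/94 set used
-// by ARGBToUV444Row_NEON. Adding 0x8080 biases the result so the final
-// >> 8 lands on unsigned 128 +/- uv; the saturation of uqshrn is omitted.
-static __inline uint8_t RGBToU_C_sketch(int b2, int g2, int r2) {
-  return (uint8_t)((b2 * 56 - g2 * 37 - r2 * 19 + 0x8080) >> 8);
-}
-static __inline uint8_t RGBToV_C_sketch(int b2, int g2, int r2) {
-  return (uint8_t)((r2 * 56 - g2 * 47 - b2 * 9 + 0x8080) >> 8);
-}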
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-// TODO(fbarchard): consider ptrdiff_t for all strides.
-
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb_1 = src_argb + src_stride_argb;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb_1 = src_argb + src_stride_argb;
- asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_bgra_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_abgr_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_rgba_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_rgb24_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RAWToUVRow_NEON(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_raw_1 = src_raw + src_stride_raw;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_raw_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
- asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
- asm volatile(
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_argb1555_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28");
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
- asm volatile(
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_argb4444_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28"
-
- );
-}
-
-void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
- "v27");
-}
-
-void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
-}
-
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
-
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction), // %4
- "+r"(y0_fraction) // %5
- :
- : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
-}
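-
-// Scalar sketch (not part of the original source) of the general blend
-// path above; the 0 and 128 fraction cases are just a copy and an average.
-static void InterpolateRow_C_sketch(uint8_t* dst,
-                                    const uint8_t* src0,
-                                    const uint8_t* src1,
-                                    int width,
-                                    int y1_fraction) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)(
-        (src0[i] * (256 - y1_fraction) + src1[i] * y1_fraction + 128) >> 8);
-  }
-}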
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
-
- "89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
-
- // Blend 1 pixel.
- "1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18");
-}
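-
-// Scalar sketch (not part of the original source) of one blended channel,
-// matching the identity in the comment above. uqrshrn rounds the product
-// and uqsub/uqadd clamp; an explicit clamp stands in for them here.
-static __inline uint8_t BlendChannel_C_sketch(int s, int sa, int d) {
-  int v = s + d - ((d * sa + 128) >> 8);
-  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
-}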
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // Attenuate 8 pixels.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
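-
-// Scalar sketch (not part of the original source): attenuation multiplies
-// each color channel by its alpha with rounding (the uqrshrn #8 above).
-static __inline uint8_t Attenuate_C_sketch(uint8_t c, uint8_t a) {
-  return (uint8_t)((c * a + 128) >> 8);
-}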
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
-
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
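-
-// Scalar sketch (not part of the original source) of the quantize formula
-// above; sqdmulh with scale >> 1 is the NEON stand-in for the 16.16
-// fixed-point multiply.
-static __inline uint8_t Quantize_C_sketch(uint8_t v,
-                                          int scale,
-                                          int interval_size,
-                                          int interval_offset) {
-  int q = ((v * scale) >> 16) * interval_size + interval_offset;
-  return (uint8_t)(q < 0 ? 0 : q > 255 ? 255 : q);
-}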
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- "1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
-}
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-
-void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
-}
-
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v22", "v23", "v24", "v25");
-}
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- // 16 pixel loop.
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1");
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2LL), // %5
- "r"(6LL) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
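-
-// Scalar sketch (not part of the original source) of one SobelX output
-// from the tap pattern above: column 0 minus column 2 across three rows,
-// middle row counted twice, absolute value clamped to 255.
-static __inline uint8_t SobelX_C_sketch(const uint8_t* y0,
-                                        const uint8_t* y1,
-                                        const uint8_t* y2,
-                                        int i) {
-  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
-          (y2[i] - y2[i + 2]);
-  if (s < 0) s = -s;
-  return (uint8_t)(s > 255 ? 255 : s);
-}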
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1LL), // %4
- "r"(6LL) // %5
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
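-
-// Scalar sketch (not part of the original source) of one SobelY output:
-// row 0 minus row 1 at offsets 0, 1 (doubled) and 2, absolute value
-// clamped to 255.
-static __inline uint8_t SobelY_C_sketch(const uint8_t* y0,
-                                        const uint8_t* y1,
-                                        int i) {
-  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
-          (y0[i + 2] - y1[i + 2]);
-  if (s < 0) s = -s;
-  return (uint8_t)(s > 255 ? 255 : s);
-}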
-
-// Caveat - rounds float to half float whereas scaling version truncates.
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3");
-}
-
-void HalfFloatRow_NEON(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "v1", "v2", "v3");
-}
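-
-// The constant above is scale * 2^-112 (1.9259299444e-34f == 2^-112).
-// The multiply rebiases the float's exponent so that, after uqshrn's 13-bit
-// right shift, the low 16 bits hold an IEEE half. A scalar sketch of one
-// lane (assumes IEEE-754 floats; add <string.h> for memcpy if built alone):
-static uint16_t HalfFromFloat_Sketch(float v, float scale) {
-  float f = v * scale * 1.9259299444e-34f;  // exponent rebias by 2^-112
-  uint32_t bits;
-  memcpy(&bits, &f, sizeof(bits));  // reinterpret float as uint32_t
-  bits >>= 13;                      // align exponent/mantissa to half layout
-  return (uint16_t)(bits > 0xffff ? 0xffff : bits);  // uqshrn saturates
-}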
-
-void ByteToFloatRow_NEON(const uint8_t* src,
- float* dst,
- float scale,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "v1", "v2", "v3");
-}
-
-float ScaleMaxSamples_NEON(const float* src,
- float* dst,
- float scale,
- int width) {
- float fmax;
- asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width), // %2
- "=w"(fmax) // %3
- : "w"(scale) // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fmax;
-}
-
-float ScaleSumSamples_NEON(const float* src,
- float* dst,
- float scale,
- int width) {
- float fsum;
- asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width), // %2
- "=w"(fsum) // %3
- : "w"(scale) // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fsum;
-}
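-
-// Note the accumulators sum squares of the *unscaled* input while the scaled
-// samples are stored. A scalar sketch of that contract (hypothetical name):
-static float ScaleSumSamples_Sketch_C(const float* src, float* dst,
-                                      float scale, int width) {
-  float sum_sq = 0.f;
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src[i] * scale;    // fmul: scaled output
-    sum_sq += src[i] * src[i];  // fmla: sum of squares of the raw input
-  }
-  return sum_sq;
-}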
-
-void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "v1", "v2");
-}
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussCol_NEON(const uint16_t* src0,
- const uint16_t* src1,
- const uint16_t* src2,
- const uint16_t* src3,
- const uint16_t* src4,
- uint32_t* dst,
- int width) {
- asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(src4), // %4
- "+r"(dst), // %5
- "+r"(width) // %6
- :
- : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
-}
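-
-// The vertical pass in scalar form (a sketch; hypothetical name). The 16-bit
-// inputs times the kernel sum of 16 fit comfortably in the 32-bit outputs:
-static void GaussCol_Sketch_C(const uint16_t* src0, const uint16_t* src1,
-                              const uint16_t* src2, const uint16_t* src3,
-                              const uint16_t* src4, uint32_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src0[i] + 4 * src1[i] + 6 * src2[i] + 4 * src3[i] + src4[i];
-  }
-}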
-
-// filter 5 adjacent samples horizontally with 1, 4, 6, 4, 1 coefficients to
-// produce 1 sample.
-void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
- const uint32_t* src1 = src + 1;
- const uint32_t* src2 = src + 2;
- const uint32_t* src3 = src + 3;
- asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(dst), // %4
- "+r"(width) // %5
- : "r"(32LL) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
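-
-// The horizontal pass in scalar form (a sketch; hypothetical name). Combined
-// with the vertical pass above, the 5x5 kernel weighs 256, which the rounded
-// 8-bit shift renormalizes:
-static void GaussRow_Sketch_C(const uint32_t* src, uint16_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t sum = src[i] + 4 * src[i + 1] + 6 * src[i + 2] +
-                   4 * src[i + 3] + src[i + 4];
-    sum = (sum + 128) >> 8;  // uqrshrn #8: round to nearest
-    dst[i] = (uint16_t)(sum > 0xffff ? 0xffff : sum);  // and saturate
-  }
-}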
-
-// Convert biplanar NV21 to packed YUV24
-void NV21ToYUV24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
- "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
- "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_yuv24), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2");
-}
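-
-// Scalar form of the ld2/zip/st3 sequence (a sketch; hypothetical name).
-// Each VU pair serves two Y pixels, and the triplets land V,U,Y in memory:
-static void NV21ToYUV24Row_Sketch_C(const uint8_t* src_y,
-                                    const uint8_t* src_vu,
-                                    uint8_t* dst_yuv24, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_yuv24[3 * i + 0] = src_vu[(i & ~1) + 0];  // V, shared per pixel pair
-    dst_yuv24[3 * i + 1] = src_vu[(i & ~1) + 1];  // U
-    dst_yuv24[3 * i + 2] = src_y[i];              // Y
-  }
-}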
-
-void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_uv,
- int width) {
- const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
- asm volatile(
-
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v2.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_ayuv_1), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
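-
-// Scalar form of the 2x2 box average above (a sketch; hypothetical name).
-// Per the loads, AYUV here is V,U,Y,A per pixel and the output is U,V pairs.
-// Assumes an even width, as the 16-at-a-time NEON loop does:
-static void AYUVToUVRow_Sketch_C(const uint8_t* src_ayuv, int stride,
-                                 uint8_t* dst_uv, int width) {
-  const uint8_t* row1 = src_ayuv + stride;  // second row of the 2x2 box
-  for (int i = 0; i < width; i += 2) {
-    const uint8_t* p0 = src_ayuv + i * 4;
-    const uint8_t* p1 = row1 + i * 4;
-    int v = p0[0] + p0[4] + p1[0] + p1[4];
-    int u = p0[1] + p0[5] + p1[1] + p1[5];
-    dst_uv[i + 0] = (uint8_t)((u + 2) >> 2);  // uqrshrn #2: rounded /4
-    dst_uv[i + 1] = (uint8_t)((v + 2) >> 2);
-  }
-}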
-
-void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_vu,
- int width) {
- const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
- asm volatile(
-
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_ayuv_1), // %1
- "+r"(dst_vu), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Copy row of AYUV Y's into Y
-void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void FloatDivToByteRow_NEON(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
- asm volatile(
- "movi v0.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
- "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
- "subs %w4, %w4, #8 \n" // 8 pixels per loop
-
- "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
- "fdiv v2.4s, v4.4s, v2.4s \n"
-
- "fcvtas v1.4s, v1.4s \n" // float to int
- "fcvtas v2.4s, v2.4s \n" // float to int
- "uqxtn v1.4h, v1.4s \n" // 8 shorts
- "uqxtn2 v1.8h, v2.4s \n"
- "uqxtn v1.8b, v1.8h \n" // 8 bytes
-
- "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
-
- "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
- "fcmgt v6.4s, v2.4s, v0.4s \n"
- "uqxtn v5.4h, v5.4s \n" // 8 shorts
- "uqxtn2 v5.8h, v6.4s \n"
- "uqxtn v5.8b, v1.8h \n" // 8 bytes
-
- "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
-
- "b.gt 1b \n"
- : "+r"(src_weights), // %0
- "+r"(src_values), // %1
- "+r"(dst_out), // %2
- "+r"(dst_mask), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
-}
-
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2");
-}
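-
-// Scalar form (a sketch; hypothetical name): a byte swap within each pair.
-static void UVToVURow_Sketch_C(const uint8_t* src_uv, uint8_t* dst_vu,
-                               int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V first
-    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
-  }
-}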
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
deleted file mode 100644
index 17831372..00000000
--- a/files/source/scale_any.cc
+++ /dev/null
@@ -1,575 +0,0 @@
-/*
- * Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string.h> // For memset/memcpy
-
-#include "libyuv/scale.h"
-#include "libyuv/scale_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
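-
-// For example, ScaleFilterCols_Any_NEON(dst, src, 100, x, dx) with MASK 7
-// runs the NEON kernel on n = 96 columns, then the C fallback on the
-// remaining r = 4, with x advanced by 96 * dx (a worked illustration only).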
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MMI
-CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
-// Fixed scale down.
-// Mask may be non-power of 2, so use MOD
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
- }
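-
-// Note FACTOR is substituted textually, so a 3/4 scaler's `4 / 3` expands to
-// `(n * 4 / 3) * BPP` = ((n * 4) / 3) * BPP by left-to-right evaluation.
-// E.g. dst_width 100 with MASK 23: r = 100 % 24 = 4, n = 96, and the C
-// fallback starts at source byte (96 * 4) / 3 = 128.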
-
-// Fixed scale down for odd source width. Used by I420Blend subsampling.
-// Since dst_width is (width + 1) / 2, this function scales one less pixel
-// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
- int n = (dst_width - 1) - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r + 1); \
- }
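-
-// E.g. an odd source width of 101 gives dst_width 51; with MASK 15,
-// r = (51 - 1) % 16 = 2 and n = 48, so the SIMD path emits 48 pixels and
-// the C path emits r + 1 = 3, the last being the copied final pixel.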
-
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3,
- ScaleRowDown2Linear_SSSE3,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3,
- ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3,
- ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN2_AVX2
-SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2,
- ScaleRowDown2Linear_AVX2,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 31)
-SDANY(ScaleRowDown2Box_Any_AVX2,
- ScaleRowDown2Box_AVX2,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2,
- ScaleRowDown2Box_AVX2,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 31)
-#endif
-#ifdef HAS_SCALEROWDOWN2_NEON
-SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON,
- ScaleRowDown2Linear_NEON,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 15)
-SDANY(ScaleRowDown2Box_Any_NEON,
- ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 15)
-SDODD(ScaleRowDown2Box_Odd_NEON,
- ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN2_MSA
-SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_MSA,
- ScaleRowDown2Linear_MSA,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 31)
-SDANY(ScaleRowDown2Box_Any_MSA,
- ScaleRowDown2Box_MSA,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 31)
-#endif
-#ifdef HAS_SCALEROWDOWN2_MMI
-SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
-SDANY(ScaleRowDown2Linear_Any_MMI,
- ScaleRowDown2Linear_MMI,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 7)
-SDANY(ScaleRowDown2Box_Any_MMI,
- ScaleRowDown2Box_MMI,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 7)
-SDODD(ScaleRowDown2Box_Odd_MMI,
- ScaleRowDown2Box_MMI,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_SSSE3
-SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3,
- ScaleRowDown4Box_SSSE3,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_AVX2
-SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2,
- ScaleRowDown4Box_AVX2,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_NEON
-SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON,
- ScaleRowDown4Box_NEON,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_MSA
-SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_MSA,
- ScaleRowDown4Box_MSA,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_MMI
-SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_MMI,
- ScaleRowDown4Box_MMI,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3,
- ScaleRowDown34_SSSE3,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
- ScaleRowDown34_0_Box_SSSE3,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
- ScaleRowDown34_1_Box_SSSE3,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 23)
-#endif
-#ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON,
- ScaleRowDown34_NEON,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON,
- ScaleRowDown34_0_Box_NEON,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON,
- ScaleRowDown34_1_Box_NEON,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 23)
-#endif
-#ifdef HAS_SCALEROWDOWN34_MSA
-SDANY(ScaleRowDown34_Any_MSA,
- ScaleRowDown34_MSA,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 47)
-SDANY(ScaleRowDown34_0_Box_Any_MSA,
- ScaleRowDown34_0_Box_MSA,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 47)
-SDANY(ScaleRowDown34_1_Box_Any_MSA,
- ScaleRowDown34_1_Box_MSA,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 47)
-#endif
-#ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3,
- ScaleRowDown38_SSSE3,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
- ScaleRowDown38_3_Box_SSSE3,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
- ScaleRowDown38_2_Box_SSSE3,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 5)
-#endif
-#ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON,
- ScaleRowDown38_NEON,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON,
- ScaleRowDown38_3_Box_NEON,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON,
- ScaleRowDown38_2_Box_NEON,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 11)
-#endif
-#ifdef HAS_SCALEROWDOWN38_MSA
-SDANY(ScaleRowDown38_Any_MSA,
- ScaleRowDown38_MSA,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_MSA,
- ScaleRowDown38_3_Box_MSA,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_2_Box_Any_MSA,
- ScaleRowDown38_2_Box_MSA,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 11)
-#endif
-
-#ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2,
- ScaleARGBRowDown2_SSE2,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
- ScaleARGBRowDown2Linear_SSE2,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2,
- ScaleARGBRowDown2Box_SSE2,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON,
- ScaleARGBRowDown2_NEON,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON,
- ScaleARGBRowDown2Linear_NEON,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON,
- ScaleARGBRowDown2Box_NEON,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 7)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_MSA
-SDANY(ScaleARGBRowDown2_Any_MSA,
- ScaleARGBRowDown2_MSA,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Linear_Any_MSA,
- ScaleARGBRowDown2Linear_MSA,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Box_Any_MSA,
- ScaleARGBRowDown2Box_MSA,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_MMI
-SDANY(ScaleARGBRowDown2_Any_MMI,
- ScaleARGBRowDown2_MMI,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 1)
-SDANY(ScaleARGBRowDown2Linear_Any_MMI,
- ScaleARGBRowDown2Linear_MMI,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 1)
-SDANY(ScaleARGBRowDown2Box_Any_MMI,
- ScaleARGBRowDown2Box_MMI,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 1)
-#endif
-#undef SDANY
-
-// Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
- uint8_t* dst_ptr, int dst_width) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
- dst_ptr + n * BPP, r); \
- }
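-
-// E.g. ScaleARGBRowDownEven with src_stepx 2 samples every other ARGB pixel;
-// with MASK 3 and dst_width 10, the SIMD path emits 8 pixels and the C path
-// the last 2, starting at source byte (8 * 2) * 4 = 64.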
-
-#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2,
- ScaleARGBRowDownEven_SSE2,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
- ScaleARGBRowDownEvenBox_SSE2,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON,
- ScaleARGBRowDownEven_NEON,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
- ScaleARGBRowDownEvenBox_NEON,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
-SDAANY(ScaleARGBRowDownEven_Any_MSA,
- ScaleARGBRowDownEven_MSA,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
- ScaleARGBRowDownEvenBox_MSA,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
-SDAANY(ScaleARGBRowDownEven_Any_MMI,
- ScaleARGBRowDownEven_MMI,
- ScaleARGBRowDownEven_C,
- 4,
- 1)
-SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
- ScaleARGBRowDownEvenBox_MMI,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 1)
-#endif
-
-#ifdef SASIMDONLY
-// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
-
-// Add rows box filter scale down. Using macro from row_any
-#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint16_t dst_temp[32]); \
- SIMD_ALIGNED(uint8_t src_temp[32]); \
- memset(dst_temp, 0, 32 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(src_temp, dst_temp, MASK + 1); \
- memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
- }
-
-#ifdef HAS_SCALEADDROW_SSE2
-SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_AVX2
-SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
-#endif
-#ifdef HAS_SCALEADDROW_NEON
-SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MSA
-SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
-#endif
-#undef SAROW
-
-#else
-
-// Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
- int n = src_width & ~MASK; \
- if (n > 0) { \
- SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
- } \
- SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
- }
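-
-// The dispatched row op accumulates one source row into a 16-bit sum row
-// that a later pass divides by the box height. A scalar sketch of that
-// accumulate contract (hypothetical name, for illustration):
-static void ScaleAddRow_Sketch_C(const uint8_t* src_ptr, uint16_t* dst_ptr,
-                                 int src_width) {
-  for (int i = 0; i < src_width; ++i) {
-    dst_ptr[i] += src_ptr[i];  // accumulate; caller normalizes later
-  }
-}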
-
-#ifdef HAS_SCALEADDROW_SSE2
-SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_AVX2
-SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
-#endif
-#ifdef HAS_SCALEADDROW_NEON
-SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MSA
-SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
-#endif
-#undef SAANY
-
-#endif // SASIMDONLY
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_dspr2.cc b/files/source/scale_dspr2.cc
deleted file mode 100644
index ddedcbf4..00000000
--- a/files/source/scale_dspr2.cc
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- // TODO(fbarchard): Use odd pixels instead of even.
- "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
- "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
- "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
- "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t8, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t1, 8(%[dst]) \n"
- "sw $t2, 12(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0xf \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t0, 1(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 2 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- const uint8* t = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
- "bltz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 0(%[t]) \n" // |19|18|17|16|
- "lw $t5, 4(%[t]) \n" // |23|22|21|20|
- "lw $t6, 8(%[t]) \n" // |27|26|25|24|
- "lw $t7, 12(%[t]) \n" // |31|30|29|28|
- "addiu $t9, $t9, -1 \n"
- "srl $t8, $t0, 16 \n" // |X|X|3|2|
- "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
- "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
- "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
- "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
- "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
- "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
- "srl $t8, $t1, 16 \n" // |X|X|7|6|
- "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
- "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
- "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
- "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
- "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
- "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
- "srl $t8, $t2, 16 \n" // |X|X|11|10|
- "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
- "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
- "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
- "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
- "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
- "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
- "srl $t8, $t3, 16 \n" // |X|X|15|14|
- "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
- "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
- "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
- "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
- "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
- "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
- "addiu %[src_ptr], %[src_ptr], 16 \n"
- "addiu %[t], %[t], 16 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "sb $t1, 2(%[dst]) \n"
- "sb $t5, 3(%[dst]) \n"
- "sb $t2, 4(%[dst]) \n"
- "sb $t6, 5(%[dst]) \n"
- "sb $t3, 6(%[dst]) \n"
- "sb $t7, 7(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0x7 \n" // x = residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lwr $t1, 0(%[src_ptr]) \n"
- "lwl $t1, 3(%[src_ptr]) \n"
- "lwr $t2, 0(%[t]) \n"
- "lwl $t2, 3(%[t]) \n"
- "srl $t8, $t1, 16 \n"
- "ins $t1, $t2, 16, 16 \n"
- "ins $t2, $t8, 0, 16 \n"
- "raddu.w.qb $t1, $t1 \n"
- "raddu.w.qb $t2, $t2 \n"
- "shra_r.w $t1, $t1, 2 \n"
- "shra_r.w $t2, $t2, 2 \n"
- "sb $t1, 0(%[dst]) \n"
- "sb $t2, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -2 \n"
- "addiu %[t], %[t], 4 \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 2 \n"
-
- "3: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n"
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
- "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
- "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
- "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
- "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
- "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t5, 4(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 7 \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t1, 2(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t1, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- const uint8* s2 = s1 + stride;
- const uint8* s3 = s2 + stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 1 \n"
- "andi $t8, %[dst_width], 1 \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
- "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
- "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
- "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
- "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
- "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "add $t4, $t4, $t5 \n"
- "add $t6, $t6, $t7 \n"
- "add $t4, $t4, $t6 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "shra_r.w $t4, $t4, 4 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[s3], %[s3], 8 \n"
- "addiu $t9, $t9, -1 \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 2 \n"
- "beqz $t8, 2f \n"
- " nop \n"
-
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "sb $t0, 0(%[dst]) \n"
-
- "2: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
- [s3] "+r"(s3)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
- "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
- "addiu %[dst_width], %[dst_width], -24 \n"
- "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
- "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
- "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
- "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
- "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
- "prepend $t1, $t2, 8 \n" // |4|3|1|0|
- "prepend $t3, $t4, 24 \n" // |15|13|12|11|
- "prepend $t5, $t6, 8 \n" // |20|19|17|16|
- "prepend $t7, $t8, 24 \n" // |31|29|28|27|
- "sw $t1, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t3, 8(%[dst]) \n"
- "sw $t5, 12(%[dst]) \n"
- "sw $t9, 16(%[dst]) \n"
- "sw $t7, 20(%[dst]) \n"
- "bnez %[dst_width], 1b \n"
- " addiu %[dst], %[dst], 24 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t3, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
- "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t2, $t2, $t4 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "sll $t5, $t0, 1 \n"
- "add $t0, $t5, $t0 \n"
- "shra_r.ph $t2, $t2, 2 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shll.ph $t4, $t2, 1 \n"
- "addq.ph $t4, $t4, $t2 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.w $t0, $t0, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "srl $t1, $t6, 16 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t2, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
- "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t4, $t4, $t3 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shra_r.ph $t4, $t4, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.ph $t6, $t6, 1 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "srl $t1, $t6, 16 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t6, $t6 \n" // |26|27|24|25|
- "srl $t0, $t0, 8 \n" // |X|2|3|0|
- "srl $t3, $t3, 16 \n" // |X|X|15|14|
- "srl $t5, $t5, 16 \n" // |X|X|23|22|
- "srl $t7, $t7, 16 \n" // |X|X|31|30|
- "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
- "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
- "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
- "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
- "prepend $t2, $t3, 24 \n" // |X|15|14|11|
- "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
- "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu %[dst_width], %[dst_width], -12 \n"
- "addiu $t8,%[dst_width], -12 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t4, 4(%[dst]) \n"
- "sw $t6, 8(%[dst]) \n"
- "bgez $t8, 1b \n"
- " addiu %[dst], %[dst], 12 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* t = src_ptr + stride;
- const int c = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
- "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
- "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
- "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
- "srl $t4, $t4, 2 \n" // t4 / 4
- "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
- "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
- "addu $t6, $t5, $t6 \n"
- "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
- "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
- "addu $t0, $t0, $t2 \n"
- "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[t], %[t], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t4, -1(%[dst_ptr]) \n"
- "sb $t6, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
- [dst_width] "+r"(dst_width)
- : [c] "r"(c)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- stride += stride;
- const uint8* s2 = src_ptr + stride;
- const int c1 = 0x1C71;
- const int c2 = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
- "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
- "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
- "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
- "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
- "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
- "raddu.w.qb $t8, $t8 \n" // R5+R4
- "addu $t7, $t7, $t8 \n"
- "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
- "raddu.w.qb $t8, $t8 \n" // R7 + R6
- "addu $t6, $t6, $t8 \n"
- "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
- "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
- "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
- "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
- "addu $t7, $t7, $t8 \n"
- "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t2, $t2 \n"
- "raddu.w.qb $t4, $t4 \n"
- "addu $t0, $t0, $t2 \n"
- "addu $t0, $t0, $t4 \n"
- "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t7, $t7, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t6, -1(%[dst_ptr]) \n"
- "sb $t7, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
- [s2] "+r"(s2), [dst_width] "+r"(dst_width)
- : [c1] "r"(c1), [c2] "r"(c2)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
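-
-// The constants above are rounded 16.16 reciprocals: 0x2AAA ~ 65536 / 6 and
-// 0x1C71 ~ 65536 / 9, so a `mul` followed by `srl ..., 16` averages a 6- or
-// 9-sample box sum without a per-pixel divide.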
-
-void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- int x;
- for (x = 0; x < (src_width - 1); x += 8) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
- uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t5], 0(%[src_ptr]) \n"
- "lw %[tmp_t6], 4(%[src_ptr]) \n"
- "lw %[tmp_t1], 0(%[dst_ptr]) \n"
- "lw %[tmp_t2], 4(%[dst_ptr]) \n"
- "lw %[tmp_t3], 8(%[dst_ptr]) \n"
- "lw %[tmp_t4], 12(%[dst_ptr]) \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
- "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
- "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
- "sw %[tmp_t1], 0(%[dst_ptr]) \n"
- "sw %[tmp_t2], 4(%[dst_ptr]) \n"
- "sw %[tmp_t3], 8(%[dst_ptr]) \n"
- "sw %[tmp_t4], 12(%[dst_ptr]) \n"
- ".set pop \n"
- :
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
- [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
- : [dst_ptr] "r"(dst_ptr));
- src_ptr += 8;
- dst_ptr += 8;
- }
-
- if (src_width & 7) {
- for (x = 0; x < ((src_width - 1) & 7); x += 1) {
- dst_ptr[0] += src_ptr[0];
- src_ptr += 1;
- dst_ptr += 1;
- }
- }
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
deleted file mode 100644
index 90a49f30..00000000
--- a/files/source/scale_gcc.cc
+++ /dev/null
@@ -1,1374 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Offsets for source bytes 0 to 9
-static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 0 to 10
-static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
-static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
- 8, 9, 9, 10, 10, 11, 12, 13};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
- 10, 11, 12, 13, 13, 14, 14, 15};
-
-// Coefficients for source bytes 0 to 10
-static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
-
-// Coefficients for source bytes 10 to 21
-static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
-
-// Coefficients for source bytes 21 to 31
-static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
-
-// Rounding constant for the 3/4 box filters.
-static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-
-static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
- 6, 8, 11, 14, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 0,1,2
-static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 3,4,5
-static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
- 6, 7, 12, 13, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x3 and 2x3
-static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
- 65536 / 9, 65536 / 6, 0, 0};
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
- 11, 128, 14, 128, 128, 128, 128, 128};
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
- 12, 128, 15, 128, 128, 128, 128, 128};
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
- 13, 128, 128, 128, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x2 and 2x2
-static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
- 65536 / 3, 65536 / 2, 0, 0};
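-
-// These tables are 16.16 reciprocals of the box sizes: a box sum s becomes
-// an average via (s * (65536 / n)) >> 16, e.g. (s * 7281) >> 16 ~ s / 9,
-// trading the divide for a 16-bit multiply-high.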
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-
-void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SCALEROWDOWN2_AVX2
-
-void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- intptr_t stridex3;
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "=&r"(stridex3) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(src_stride * 3)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_SCALEROWDOWN4_AVX2
-
-void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
-
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
-
- "pxor %%xmm5,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
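// For reference, a scalar sketch of the row accumulation above (hypothetical
// name, not part of libyuv). Note the SSE2 loop uses paddusw, so its adds
// saturate at 65535; this plain-C sketch wraps instead.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}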
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
-
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SCALEADDROW_AVX2
-
-// Constant for making pixels signed to avoid pmaddubsw saturation.
-static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
- 0x4040, 0x4040, 0x4040, 0x4040};
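// How these two constants combine, as a worked sketch (not part of the
// original file): pmaddubsw multiplies unsigned bytes by signed bytes, so the
// filter below biases each pixel by -128 first. For pixels a, b and a 7-bit
// fraction f it then computes
//   (a - 128) * (128 - f) + (b - 128) * f = a * (128 - f) + b * f - 0x4000
// Adding kFadd40 (0x4040 = 0x4000 + 0x40) removes the bias and supplies the
// 0.5 rounding term before the final >> 7. A scalar model, with the
// hypothetical name FilterPixel_Sketch (assumes f in [0, 128)):
static inline uint8_t FilterPixel_Sketch(uint8_t a, uint8_t b, int f) {
  int sum = (a - 128) * (128 - f) + (b - 128) * f;  // what pmaddubsw yields.
  return (uint8_t)((sum + 0x4040) >> 7);            // un-bias and round.
}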
-
-// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1, temp_pixel;
- asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
- // 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "=&a"(temp_pixel), // %2
- "=&r"(x0), // %3
- "=&r"(x1), // %4
-#if defined(__x86_64__)
- "+rm"(dst_width) // %5
-#else
- "+m"(dst_width) // %5
-#endif
- : "rm"(x), // %6
- "rm"(dx), // %7
-#if defined(__x86_64__)
- "x"(kFsub80), // %8
- "x"(kFadd40) // %9
-#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
-#endif
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-// Reads 16 pixels, duplicates them and writes 32 pixels per loop.
-// No alignment requirement: the loop uses unaligned loads and stores (movdqu).
-void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- (void)x;
- (void)dx;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
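// A scalar sketch of the 2x point upsample above (hypothetical helper, not
// part of libyuv): each source byte is simply written twice.
static void ScaleColsUp2_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst_ptr[2 * x] = dst_ptr[2 * x + 1] = src_ptr[x];
  }
}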
-
-void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
-
-// Reads 4 ARGB pixels at a time.
-// No alignment requirement: the destination is written with movdqu.
-void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
- (void)src_stride;
- asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
-
- LABELALIGN
- "1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "=&r"(src_stepx_x12) // %4
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
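// A scalar sketch of the even-pixel subsample above (hypothetical helper, not
// part of libyuv): every src_stepx-th ARGB pixel is copied.
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb, int src_stepx,
                                        uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}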
-
-// Blends four 2x2 ARGB blocks down to 4x1.
-// No alignment requirement: the destination is written with movdqu.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
- intptr_t row1 = (intptr_t)(src_stride);
- asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "=&r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
-
-void ScaleARGBCols_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1;
- asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- LABELALIGN
- "40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
-
- "49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
- "29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "99: \n"
- : "=&a"(x0), // %0
- "=&d"(x1), // %1
- "+r"(dst_argb), // %2
- "+r"(src_argb), // %3
- "+r"(dst_width) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
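// For reference, a scalar sketch of the same 16.16 fixed-point point sampling
// (hypothetical name, not part of libyuv): each output pixel is src[x >> 16]
// and x advances by dx.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}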
-
-// Reads 4 ARGB pixels, duplicates them and writes 8 ARGB pixels.
-// No alignment requirement: the loop uses unaligned loads and stores (movdqu).
-void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- (void)x;
- (void)dx;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static const uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static const uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1;
- asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
- :
- : "m"(kShuffleColARGB), // %0
- "m"(kShuffleFractions) // %1
- );
-
- asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // one line to sidestep a clang-format error.
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "=&r"(x0), // %3
- "=&r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
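// A scalar model of the per-channel blend above (a sketch only; the
// hypothetical name is not part of libyuv). f is the top 7 bits of the 16.16
// fraction, and the weights (127 - f, f) sum to 127, the slight approximation
// that the pxor with the 0x7f constant in xmm6 sets up:
static inline uint8_t BlendARGBChannel_Sketch(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (127 - f) + b * f) >> 7);
}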
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_X86(int num, int div) {
- asm volatile(
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx");
- return num;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-int FixedDiv1_X86(int num, int div) {
- asm volatile(
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx");
- return num;
-}
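// For reference, portable scalar sketches of the two divides above (the asm
// keeps the 48-bit intermediate in edx:eax, which int64_t reproduces). The
// _Sketch names are hypothetical and not part of libyuv.
static inline int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

static inline int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}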
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc
deleted file mode 100644
index 990463c2..00000000
--- a/files/source/scale_mmi.cc
+++ /dev/null
@@ -1,1113 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// clang-format off
-
-// CPU agnostic row functions
-void ScaleRowDown2_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlh %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
-
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "and %[dest0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "and %[dest1], %[src1], %[mask] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
-
- "psrlh %[src0], %[src0], %[shift] \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
- "packushb %[dest1], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* s = src_ptr;
- const uint8_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, t0, t1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift0 = 0x2ULL;
- const uint64_t shift1 = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest0], %[s0], %[s1] \n\t"
- "paddh %[dest0], %[dest0], %[t0] \n\t"
- "paddh %[dest0], %[dest0], %[t1] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest1], %[s0], %[s1] \n\t"
- "paddh %[dest1], %[dest1], %[t0] \n\t"
- "paddh %[dest1], %[dest1], %[t1] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift0] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
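// A scalar sketch of the 2x2 box average above (hypothetical helper, not part
// of libyuv): the ph constant adds the +2 rounding term before the >> 2.
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, uint8_t* dst,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)(
        (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}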
-
-void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
- "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* s = src_argb;
- const uint8_t* t = src_argb + src_stride;
-
- uint64_t s0, s_hi, s_lo;
- uint64_t t0, t_hi, t_lo;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shift = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
-
- "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
-
- "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
- [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit)
- : "memory");
-}
-
-void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x10ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
-
- "packsswh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
-
- "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
-
- "pavgh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* s = src_ptr;
- const uint16_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, s_hi, s_lo;
- uint64_t t0, t1, t_hi, t_lo;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0000000200000002ULL;
- const uint64_t mask = 0x0000ffff0000ffffULL;
- const uint64_t shift0 = 0x10ULL;
- const uint64_t shift1 = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest0], %[s0], %[s1] \n\t"
- "paddw %[dest0], %[dest0], %[t0] \n\t"
- "paddw %[dest0], %[dest0], %[t1] \n\t"
- "paddw %[dest0], %[dest0], %[ph] \n\t"
- "psrlw %[dest0], %[dest0], %[shift1] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest1], %[s0], %[s1] \n\t"
- "paddw %[dest1], %[dest1], %[t0] \n\t"
- "paddw %[dest1], %[dest1], %[t1] \n\t"
- "paddw %[dest1], %[dest1], %[ph] \n\t"
- "psrlw %[dest1], %[dest1], %[shift1] \n\t"
-
- "packsswh %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
- [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t shift = 0x10ULL;
- const uint64_t mask = 0x000000ff000000ffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_lo], %[src0], %[src1] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_hi], %[src0], %[src1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift), [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
- "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
- "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
- "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
- "paddh " #reg ", " #reg ", %[ph] \n\t" \
- "psrlh " #reg ", " #reg ", %[shift] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box */
-void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* src0_ptr = src_ptr;
- const uint8_t* src1_ptr = src_ptr + src_stride;
- const uint8_t* src2_ptr = src_ptr + src_stride * 2;
- const uint8_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x0001000100010001ULL;
- const uint64_t ph = 0x0008000800080008ULL;
- const uint64_t shift = 0x4ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
-
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
- "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
- "paddw %[dest], %[dest_hi], %[dest] \n\t" \
- "paddw %[dest], %[dest], %[ph] \n\t" \
- "psraw %[dest], %[dest], %[shift] \n\t" \
- "and " #reg ", %[dest], %[mask1] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
-void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src0_ptr = src_ptr;
- const uint16_t* src1_ptr = src_ptr + src_stride;
- const uint16_t* src2_ptr = src_ptr + src_stride * 2;
- const uint16_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x00000000ffffffffULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 0x04ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
- "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
- "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
-
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddush %[dest0], %[dest0], %[src_lo] \n\t"
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddush %[dest1], %[dest1], %[src_hi] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
- uint32_t* dst_ptr,
- int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklhw %[src_lo], %[src], %[mask] \n\t"
- "punpckhhw %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddw %[dest0], %[dest0], %[src_lo] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
-
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddw %[dest1], %[dest1], %[src_hi] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
- [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* src0_ptr = src_argb;
- const uint8_t* src1_ptr = src_argb + src_stride;
-
- uint64_t src0, src1, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shift = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift] \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
- [ph] "f"(ph)
- : "memory");
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- const uint32_t* src_tmp;
-
- uint64_t dest, offset;
-
- const uint64_t shift0 = 16;
- const uint64_t shift1 = 2;
-
- __asm__ volatile(
- "1: \n\t"
- "srav %[offset], %[x], %[shift0] \n\t"
- "sllv %[offset], %[offset], %[shift1] \n\t"
- "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
- "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
- "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[x], %[x], %[dx] \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest0, dest1;
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklwd %[dest0], %[src], %[src] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest1], %[src], %[src] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-/* LibYUVBaseTest.TestFixedDiv */
-int FixedDiv_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
-
- return quotient;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
-int FixedDiv1_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
- const int val1 = 1;
- const int64_t val11 = 0x00010001ULL;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "dsub %[num], %[num], %[val11] \n\t"
- "dsub %[div], %[div], %[val1] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
- [shift] "r"(shift));
-
- return quotient;
-}
-
-// Reads 8x2 pixels, upsamples them with bilinear filtering and writes 16x1.
-// Actually reads one extra source pixel, so 9x2.
-void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src2_ptr = src_ptr + src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest04, dest15, dest26, dest37;
- uint64_t tmp0, tmp1, tmp2, tmp3;
-
- const uint64_t mask0 = 0x0003000900030009ULL;
- const uint64_t mask1 = 0x0001000300010003ULL;
- const uint64_t mask2 = 0x0009000300090003ULL;
- const uint64_t mask3 = 0x0003000100030001ULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 4;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
- "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest04], %[dest04], %[dest] \n\t"
- "paddw %[dest04], %[dest04], %[ph] \n\t"
- "psrlw %[dest04], %[dest04], %[shift] \n\t"
-
- "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest15], %[dest15], %[dest] \n\t"
- "paddw %[dest15], %[dest15], %[ph] \n\t"
- "psrlw %[dest15], %[dest15], %[shift] \n\t"
-
- "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
- "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest26], %[dest26], %[dest] \n\t"
- "paddw %[dest26], %[dest26], %[ph] \n\t"
- "psrlw %[dest26], %[dest26], %[shift] \n\t"
-
- "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest37], %[dest37], %[dest] \n\t"
- "paddw %[dest37], %[dest37], %[ph] \n\t"
- "psrlw %[dest37], %[dest37], %[shift] \n\t"
-
- /* tmp0 = ( 00 04 02 06 ) */
- "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
- /* tmp1 = ( 01 05 03 07 ) */
- "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
-
- /* tmp2 = ( 00 01 04 05 )*/
- "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
- /* tmp3 = ( 02 03 06 07 )*/
- "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
-
- /* ( 00 01 02 03 ) */
- "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- /* ( 04 05 06 07 ) */
- "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
- [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
- : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
- : "memory");
-}
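The mask constants above encode a 9:3:3:1 bilinear kernel: pmaddhw pairs each 9 with a 3 (top row) and each 3 with a 1 (bottom row), the four weights sum to 16, and ph/shift supply the rounding divide. In scalar form each output pixel computes the tap below, where a is the nearest source pixel and b, c, d its right, lower, and diagonal neighbors (a sketch of the arithmetic, not the deleted code):

    #include <stdint.h>

    // out = (9*a + 3*b + 3*c + 1*d + 8) >> 4  -- weights ordered by proximity.
    static inline uint16_t Up2Tap(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
      return (uint16_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
    }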
-
-// clang-format on
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
deleted file mode 100644
index 366b155b..00000000
--- a/files/source/scale_neon.cc
+++ /dev/null
@@ -1,958 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// NEON downscalers with interpolation.
-// Provided by Fritz Koenig
-
-// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
- // row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
- // pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
- );
-}
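A scalar restatement of the 2x2 box filter this loop vectorizes: each output byte is the rounded average of a 2x2 source block. Sketch modeled on the matching *_C routine in scale_common.cc, simplified to one pixel per iteration:

    #include <stddef.h>
    #include <stdint.h>

    void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst, int dst_width) {
      const uint8_t* s = src_ptr;                // row 1
      const uint8_t* t = src_ptr + src_stride;   // row 2
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds
        s += 2;
        t += 2;
      }
    }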
-
-void ScaleRowDown4_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc");
-}
-
-void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
- const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
- :
- : "q0", "q1", "q2", "q3", "memory", "cc");
-}
-
-// Downscale from 4 to 3 pixels. Use the neon multilane read/write
-// to load every 4th pixel into 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc");
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
-
- // 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
-
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
-
- "vst3.8 {d0, d1, d2}, [%1]! \n"
-
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
- "cc");
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
- // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
-
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
- 22, 24, 27, 30, 0, 0, 0, 0};
-static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
- 18, 6, 14, 19, 0, 0, 0, 0};
-static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12};
-static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18};
-
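Why 65536/12 for a divide by 6 (and 65536/18 for 9)? The vqrdmulh used by the box filters below is a doubling multiply: for .s16 lanes it returns roughly (2*a*b + 0x8000) >> 16, so the constant must be half of 65536/n. A quick check with six pixels of value 85 (sum 510), assuming those instruction semantics:

    510 * (65536 / 12) = 510 * 5461 = 2785110
    (2 * 2785110 + 0x8000) >> 16 = 5602988 >> 16 = 85    // == 510 / 6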
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vld1.8 {q3}, [%3] \n"
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
-
- asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
- "cc");
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
-}
-
-// Add a row of bytes to a row of shorts. Used for box filter.
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-void ScaleAddRow_NEON(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
- "1: \n"
- "vld1.16 {q1, q2}, [%1] \n" // load accumulator
- "vld1.8 {q0}, [%0]! \n" // load 16 bytes
- "vaddw.u8 q2, q2, d1 \n" // add
- "vaddw.u8 q1, q1, d0 \n"
- "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2" // Clobber List
- );
-}
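The scalar equivalent is a one-line accumulate; the NEON win is purely doing 16 widening adds per iteration. Sketch after the matching *_C routine:

    #include <stdint.h>

    void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] += src_ptr[x];  // widen u8 -> u16 and accumulate
      }
    }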
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
-
-// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
-// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-
-void ScaleFilterCols_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_ptr;
- asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
- // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
- LOAD2_DATA8_LANE(0)
- LOAD2_DATA8_LANE(1)
- LOAD2_DATA8_LANE(2)
- LOAD2_DATA8_LANE(3)
- LOAD2_DATA8_LANE(4)
- LOAD2_DATA8_LANE(5)
- LOAD2_DATA8_LANE(6)
- LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13"
- );
-}
-
-#undef LOAD2_DATA8_LANE
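Transcribing the BLENDER formula quoted above into a standalone helper makes the fixed-point rounding explicit; f is a 16.16 fraction selecting between neighboring source pixels a and b (a direct restatement of the row_common.cc macro):

    #include <stdint.h>

    // dst = a + f * (b - a), computed in 16.16 fixed point with rounding.
    static inline uint8_t Blender(uint8_t a, uint8_t b, int f) {
      return (uint8_t)((int)a + ((f * ((int)b - (int)a) + 0x8000) >> 16));
    }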
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
-}
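Per pixel, the general-purpose path computes a weighted average of the two rows; the 0/64/128/192 branches above are just cheaper vrhadd forms of the same blend. A scalar sketch of what the vmull/vmlal/vrshrn sequence evaluates:

    #include <stdint.h>

    // y_fraction = source_y_fraction in [0, 256); 0 copies row 0 unchanged.
    static inline uint8_t BlendRows(uint8_t row0, uint8_t row1, int y_fraction) {
      return (uint8_t)((row0 * (256 - y_fraction) + row1 * y_fraction + 128) >> 8);
    }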
-
-void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
-// 4a: 3e04 subs r6, #4
-// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
-// 50: ef64 21f4 vorr q9, q10, q10
-// 54: f942 038d vst2.32 {d16-d19}, [r2]!
-// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
-
-void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "mov r12, %3, lsl #2 \n"
- "1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0");
-}
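In scalar terms this is a strided 32-bit gather: copy every src_stepx-th ARGB word. Sketch modeled on the matching *_C routine (the assembly simply performs four such loads per iteration):

    #include <stddef.h>
    #include <stdint.h>

    void ScaleARGBRowDownEven_C(const uint8_t* src_argb, ptrdiff_t src_stride,
                                int src_stepx, uint8_t* dst_argb, int dst_width) {
      const uint32_t* src = (const uint32_t*)src_argb;  // 1 ARGB pixel = 4 bytes
      uint32_t* dst = (uint32_t*)dst_argb;
      (void)src_stride;
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = *src;
        src += src_stepx;
      }
    }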
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- "1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
-}
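The boxed variant averages a 2x2 block per output pixel, channel by channel; the vswp shuffles above only exist to pair the right sums for a single vadd. Scalar sketch after the matching *_C routine:

    #include <stddef.h>
    #include <stdint.h>

    void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride,
                                   int src_stepx, uint8_t* dst_argb, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        for (int c = 0; c < 4; ++c) {  // B, G, R, A
          dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
                                   src_argb[src_stride + c] +
                                   src_argb[src_stride + c + 4] + 2) >> 2);
        }
        src_argb += src_stepx * 4;
        dst_argb += 4;
      }
    }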
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "vld1.32 {" #dn "[" #n "]}, [%6] \n"
-
-void ScaleARGBCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int tmp;
- const uint8_t* src_tmp = src_argb;
- asm volatile(
- "1: \n"
- // clang-format off
- LOAD1_DATA32_LANE(d0, 0)
- LOAD1_DATA32_LANE(d0, 1)
- LOAD1_DATA32_LANE(d1, 0)
- LOAD1_DATA32_LANE(d1, 1)
- LOAD1_DATA32_LANE(d2, 0)
- LOAD1_DATA32_LANE(d2, 1)
- LOAD1_DATA32_LANE(d3, 0)
- LOAD1_DATA32_LANE(d3, 1)
- // clang-format on
- "vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "=&r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1");
-}
-
-#undef LOAD1_DATA32_LANE
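Per lane, the macro above implements the scalar stepping below: keep x as a 16.16 source coordinate, index by its integer part, advance by dx. Sketch after the matching *_C routine, simplified to one pixel per iteration:

    #include <stdint.h>

    void ScaleARGBCols_C(uint8_t* dst_argb, const uint8_t* src_argb,
                         int dst_width, int x, int dx) {
      const uint32_t* src = (const uint32_t*)src_argb;
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int j = 0; j < dst_width; ++j) {
        dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
        x += dx;
      }
    }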
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_argb;
- asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
- // d0, d1: a
- // d2, d3: b
- LOAD2_DATA32_LANE(d0, d2, 0)
- LOAD2_DATA32_LANE(d0, d2, 1)
- LOAD2_DATA32_LANE(d1, d3, 0)
- LOAD2_DATA32_LANE(d1, d3, 1)
- "vshrn.i32 d22, q8, #9 \n"
- "vand.16 d22, d22, d30 \n"
- "vdup.8 d24, d22[0] \n"
- "vdup.8 d25, d22[2] \n"
- "vdup.8 d26, d22[4] \n"
- "vdup.8 d27, d22[6] \n"
- "vext.8 d4, d24, d25, #4 \n"
- "vext.8 d5, d26, d27, #4 \n" // f
- "veor.8 q10, q2, q3 \n" // 0x7f ^ f
- "vmull.u8 q11, d0, d20 \n"
- "vmull.u8 q12, d1, d21 \n"
- "vmull.u8 q13, d2, d4 \n"
- "vmull.u8 q14, d3, d5 \n"
- "vadd.i16 q11, q11, q13 \n"
- "vadd.i16 q12, q12, q14 \n"
- "vshrn.i16 d0, q11, #7 \n"
- "vshrn.i16 d1, q12, #7 \n"
-
- "vst1.32 {d0, d1}, [%0]! \n" // store pixels
- "vadd.s32 q8, q8, q9 \n"
- "subs %2, %2, #4 \n" // 4 processed per loop
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-#undef LOAD2_DATA32_LANE
-
-#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
deleted file mode 100644
index 0a7b80ce..00000000
--- a/files/source/scale_neon64.cc
+++ /dev/null
@@ -1,1052 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
- );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
- );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleRowDown4_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
- const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(src_ptr2), // %3
- "+r"(src_ptr3), // %4
- "+r"(dst_width) // %5
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-// Downscale from 4 to 3 pixels. Use the neon multilane read/write
-// to load every 4th pixel into 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
-
- // 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
-
- // (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
-
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v20", "memory", "cc");
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
- // average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
-}
-
-static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
- 22, 24, 27, 30, 0, 0, 0, 0};
-static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
- 34, 6, 22, 35, 0, 0, 0, 0};
-static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12};
-static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18};
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
- ptrdiff_t tmp_src_stride = src_stride;
-
- asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
-
- // combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
-
- // Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
-
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(src_ptr1), // %3
- "+r"(dst_width) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
- "memory", "cc");
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- // TODO(fbarchard): use src_stride directly for clang 3.5+.
- ptrdiff_t tmp_src_stride = src_stride;
- asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
-
- // combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-
- // combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
-
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
-
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(dst_width) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v30", "v31", "memory", "cc");
-}
-
-// Add a row of bytes to a row of shorts. Used for box filter.
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-void ScaleAddRow_NEON(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
- "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
- "uaddw v1.8h, v1.8h, v0.8b \n"
- "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
-
-// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
-// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-
-void ScaleFilterCols_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_ptr;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
- // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
- LOAD2_DATA8_LANE(0)
- LOAD2_DATA8_LANE(1)
- LOAD2_DATA8_LANE(2)
- LOAD2_DATA8_LANE(3)
- LOAD2_DATA8_LANE(4)
- LOAD2_DATA8_LANE(5)
- LOAD2_DATA8_LANE(6)
- LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1", "v2", "v3",
- "v4", "v5", "v6", "v7", "v16", "v17"
- );
-}
-
-#undef LOAD2_DATA8_LANE
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y_fraction = 256 - source_y_fraction;
- asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
-
- "99: \n"
- "st1 {v0.b}[15], [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction), // %4
- "+r"(y_fraction) // %5
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
-}
-
-void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((int64_t)(src_stepx * 4)) // %3
- : "memory", "cc", "v0");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-// TODO(Yang Zhang): Might be worth another optimization pass in future.
-// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
- "add %1, %1, %0 \n"
- "1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"((int64_t)(src_stepx * 4)) // %4
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "ld1 {" #vn ".s}[" #n "], [%6] \n"
-
-void ScaleARGBCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- const uint8_t* src_tmp = src_argb;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- int64_t tmp64;
- asm volatile(
- "1: \n"
- // clang-format off
- LOAD1_DATA32_LANE(v0, 0)
- LOAD1_DATA32_LANE(v0, 1)
- LOAD1_DATA32_LANE(v0, 2)
- LOAD1_DATA32_LANE(v0, 3)
- LOAD1_DATA32_LANE(v1, 0)
- LOAD1_DATA32_LANE(v1, 1)
- LOAD1_DATA32_LANE(v1, 2)
- LOAD1_DATA32_LANE(v1, 3)
- // clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "=&r"(tmp64), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1");
-}
-
-#undef LOAD1_DATA32_LANE
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_argb;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
- // d0, d1: a
- // d2, d3: b
- LOAD2_DATA32_LANE(v0, v1, 0)
- LOAD2_DATA32_LANE(v0, v1, 1)
- LOAD2_DATA32_LANE(v0, v1, 2)
- LOAD2_DATA32_LANE(v0, v1, 3)
- "shrn v2.4h, v5.4s, #9 \n"
- "and v2.8b, v2.8b, v4.8b \n"
- "dup v16.8b, v2.b[0] \n"
- "dup v17.8b, v2.b[2] \n"
- "dup v18.8b, v2.b[4] \n"
- "dup v19.8b, v2.b[6] \n"
- "ext v2.8b, v16.8b, v17.8b, #4 \n"
- "ext v17.8b, v18.8b, v19.8b, #4 \n"
- "ins v2.d[1], v17.d[0] \n" // f
- "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
- "umull v16.8h, v0.8b, v7.8b \n"
- "umull2 v17.8h, v0.16b, v7.16b \n"
- "umull v18.8h, v1.8b, v2.8b \n"
- "umull2 v19.8h, v1.16b, v2.16b \n"
- "add v16.8h, v16.8h, v18.8h \n"
- "add v17.8h, v17.8h, v19.8h \n"
- "shrn v0.8b, v16.8h, #7 \n"
- "shrn2 v0.16b, v17.8h, #7 \n"
-
- "st1 {v0.4s}, [%0], #16 \n" // store pixels
- "add v5.4s, v5.4s, v6.4s \n"
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
- "v6", "v7", "v16", "v17", "v18", "v19"
- );
-}
-
-#undef LOAD2_DATA32_LANE
-
-// Read 16x2 average down and write 8x1.
-void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-// Read 8x2 upsample with filtering and write 16x1.
-// Actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
-
- "1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- : "r"(2LL), // %4
- "r"(14LL) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19" // Clobber List
- );
-}
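-
-// The kernel above is a 2x bilinear upsample: each output sample blends its
-// 2x2 source neighborhood with 9/3/3/1 weights and rounds (illustrative
-// sketch, not from the original source):
-//   dst = (9 * nearest + 3 * horizontal + 3 * vertical + diagonal + 8) >> 4;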
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/sync_chromium.py b/files/sync_chromium.py
deleted file mode 100755
index 4e51b6bd..00000000
--- a/files/sync_chromium.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Script to download a Chromium checkout into the workspace.
-
-The script downloads a full Chromium Git clone and its DEPS.
-
-The following environment variable can be used to alter the behavior:
-* CHROMIUM_NO_HISTORY - If set to 1, a Git checkout with no history will be
- downloaded. This consumes less bandwidth and disk space but is known to be
- slower in general if you have a high-speed connection.
-
-After a successful sync has completed, a .last_sync_chromium file is written to
-the chromium directory. While it exists, no more gclient sync operations will be
-performed until the --target-revision changes or the SCRIPT_VERSION constant is
-incremented. The file can be removed manually to force a new sync.
-"""
-
-import argparse
-import os
-import subprocess
-import sys
-
-# Bump this whenever the algorithm changes and you need bots/devs to re-sync,
-# ignoring the .last_sync_chromium file
-SCRIPT_VERSION = 4
-
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHROMIUM_NO_HISTORY = 'CHROMIUM_NO_HISTORY'
-
-def _parse_gclient_dict():
- gclient_dict = {}
- try:
- main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient')
- with open(main_gclient, 'rb') as deps_content:
- exec(deps_content, gclient_dict)
- except Exception as e:
- print >> sys.stderr, 'error while parsing .gclient:', e
- return gclient_dict
-
-
-def get_cache_dir():
- return _parse_gclient_dict().get('cache_dir')
-
-
-def get_target_os_list():
- return ','.join(_parse_gclient_dict().get('target_os', []))
-
-
-def main():
- CR_DIR = os.path.join(ROOT_DIR, 'chromium')
-
- p = argparse.ArgumentParser()
- p.add_argument('--target-revision', required=True,
- help='The target chromium git revision [REQUIRED]')
- p.add_argument('--chromium-dir', default=CR_DIR,
- help=('The path to the chromium directory to sync '
- '(default: %(default)r)'))
- opts = p.parse_args()
- opts.chromium_dir = os.path.abspath(opts.chromium_dir)
-
- target_os_list = get_target_os_list()
-
- # Do a quick check to see if we were successful last time to make runhooks
- # super fast.
- flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium')
- flag_file_content = '\n'.join([
- str(SCRIPT_VERSION),
- opts.target_revision,
- repr(target_os_list),
- ])
- if (os.path.exists(os.path.join(opts.chromium_dir, 'src')) and
- os.path.exists(flag_file)):
- with open(flag_file, 'r') as f:
- if f.read() == flag_file_content:
- print 'Chromium already up to date: ', opts.target_revision
- return 0
- os.unlink(flag_file)
-
- env = os.environ.copy()
-
- # Avoid downloading NaCl toolchain as part of the Chromium hooks.
- env['GYP_CHROMIUM_NO_ACTION'] = '1'
- gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient'
- args = [
- gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision
- ]
-
- if os.environ.get('CHROME_HEADLESS') == '1':
- # Running on a buildbot.
- args.append('-vvv')
-
- if sys.platform.startswith('win'):
- cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep,
- 'b', 'git-cache')
- else:
- cache_path = '/b/git-cache'
- else:
- # Support developers setting the cache_dir in .gclient.
- cache_path = get_cache_dir()
-
- # Allow for users with poor internet connections to download a Git clone
- # without history (saves several gigs but is generally slower and doesn't work
- # with the Git cache).
- if os.environ.get(CHROMIUM_NO_HISTORY) == '1':
- if cache_path:
- print >> sys.stderr, (
- 'You cannot use "no-history" mode for syncing Chrome (i.e. set the '
- '%s environment variable to 1) when you have cache_dir configured in '
- 'your .gclient.' % CHROMIUM_NO_HISTORY)
- return 1
- args.append('--no-history')
- gclient_entries_file = os.path.join(opts.chromium_dir, '.gclient_entries')
- else:
- # Write a temporary .gclient file that has the cache_dir variable added.
- gclientfile = os.path.join(opts.chromium_dir, '.gclient')
- with open(gclientfile, 'rb') as spec:
- spec = spec.read().splitlines()
- spec[-1] = 'cache_dir = %r' % (cache_path,)
- with open(gclientfile + '.tmp', 'wb') as f:
- f.write('\n'.join(spec))
-
- args += [
- '--gclientfile', '.gclient.tmp',
- '--delete_unversioned_trees', '--reset', '--upstream'
- ]
- gclient_entries_file = os.path.join(opts.chromium_dir,
- '.gclient.tmp_entries')
-
- # To avoid gclient sync problems when DEPS entries have been removed, we must
- # wipe the gclient's entries file that contains cached URLs for all DEPS.
- if os.path.exists(gclient_entries_file):
- os.unlink(gclient_entries_file)
-
- if target_os_list:
- args += ['--deps=' + target_os_list]
-
- print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir)
- ret = subprocess.call(args, cwd=opts.chromium_dir, env=env)
- if ret == 0:
- with open(flag_file, 'wb') as f:
- f.write(flag_file_content)
-
- return ret
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/third_party/gflags/BUILD.gn b/files/third_party/gflags/BUILD.gn
deleted file mode 100644
index af41b7ec..00000000
--- a/files/third_party/gflags/BUILD.gn
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-if (is_win) {
- gflags_gen_arch_root = "gen/win"
-} else {
- gflags_gen_arch_root = "gen/posix"
-}
-
-config("gflags_config") {
- include_dirs = [
- "$gflags_gen_arch_root/include", # For configured files.
- "src/src", # For everything else.
- ]
-
- defines = [
- # These macros exist so flags and symbols are properly exported when
- # building DLLs. Since we don't build DLLs, we need to disable them.
- "GFLAGS_DLL_DECL=",
- "GFLAGS_DLL_DECLARE_FLAG=",
- "GFLAGS_DLL_DEFINE_FLAG=",
- ]
-
- # GN orders flags on a target before flags from configs. The default config
- # adds -Wall, and this flag has to come after -Wall -- so it needs to
- # come from a config and can't be on the target directly.
- if (is_clang) {
- cflags = [ "-Wno-unused-local-typedef" ]
- }
-}
-
-source_set("gflags") {
- cflags = []
- sources = [
- "src/src/gflags.cc",
- "src/src/gflags_completions.cc",
- "src/src/gflags_reporting.cc",
- ]
- if (is_win) {
- sources += [ "src/src/windows_port.cc" ]
-
- cflags += [
- "/wd4005", # WIN32_LEAN_AND_MEAN.
- "/wd4267", # Conversion from size_t to "type".
- ]
- }
-
- include_dirs = [
- "$gflags_gen_arch_root/include/gflags", # For configured files.
- "$gflags_gen_arch_root/include/private", # For config.h
- ]
-
- public_configs = [ ":gflags_config" ]
-
- configs -= [ "//build/config/compiler:chromium_code" ]
- configs += [ "//build/config/compiler:no_chromium_code" ]
-
- if (is_win) {
- configs -= [ "//build/config/win:unicode" ]
- }
-
- if (is_clang) {
- # TODO(andrew): Look into fixing this warning upstream:
- # http://code.google.com/p/webrtc/issues/detail?id=760
- configs -= [ "//build/config/clang:extra_warnings" ]
- cflags += [ "-Wno-microsoft-include" ]
- }
-}
diff --git a/files/third_party/gflags/LICENSE b/files/third_party/gflags/LICENSE
deleted file mode 100644
index d15b0c24..00000000
--- a/files/third_party/gflags/LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2006, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/files/third_party/gflags/README.libyuv b/files/third_party/gflags/README.libyuv
deleted file mode 100644
index 5b3bc2db..00000000
--- a/files/third_party/gflags/README.libyuv
+++ /dev/null
@@ -1,28 +0,0 @@
-URL: https://github.com/gflags/gflags
-Version: 2.1.2
-License: New BSD
-License File: LICENSE
-
-Description:
-The gflags package contains a library that implements commandline
-flags processing. As such it's a replacement for getopt(). It has
-increased flexibility, including built-in support for C++ types like
-string, and the ability to define flags in the source file in which
-they're used.
-
-Local Modifications: None
-
-
-How to update platform configuration files:
-The gen/ directory contains pre-generated configuration header files.
-Historically, all operating systems and architectures have generated
-similar configurations except for Windows, which is why there are only
-posix and win directories below gen/.
-When rolling gflags to a newer version, it's a good idea to check
-whether new configuration files need to be generated as well.
-Do this by running ./configure in the newly checked out version of
-gflags, then diff the generated files against the ones below gen/.
-If you notice a diff, replace the checked-in files with the newly
-generated ones.
-If you suspect platform-dependent changes other than Windows, you'll
-have to check out gflags on the other platforms as well and run
-./configure there too.
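-
-Illustrative flow for the steps above (directory names are examples only):
-  ./configure
-  diff -ru <freshly generated headers> gen/posix/include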
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags.h
deleted file mode 100644
index 0db38f5c..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags.h
+++ /dev/null
@@ -1,573 +0,0 @@
-// Copyright (c) 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// or defines a command line flag or wants to parse command line flags
-// or print a program usage message (which will include information about
-// flags). Executive summary, in the form of an example foo.cc file:
-//
-// #include "foo.h" // foo.h has a line "DECLARE_int32(start);"
-// #include "validators.h" // hypothetical file defining ValidateIsFile()
-//
-// DEFINE_int32(end, 1000, "The last record to read");
-//
-// DEFINE_string(filename, "my_file.txt", "The file to read");
-// // Crash if the specified file does not exist.
-// static bool dummy = RegisterFlagValidator(&FLAGS_filename,
-// &ValidateIsFile);
-//
-// DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
-//
-// void MyFunc() {
-// if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
-// }
-//
-// Then, at the command-line:
-// ./foo --noverbose --start=5 --end=100
-//
-// For more details, see
-// doc/gflags.html
-//
-// --- A note about thread-safety:
-//
-// We describe many functions in this routine as being thread-hostile,
-// thread-compatible, or thread-safe. Here are the meanings we use:
-//
-// thread-safe: it is safe for multiple threads to call this routine
-// (or, when referring to a class, methods of this class)
-// concurrently.
-// thread-hostile: it is not safe for multiple threads to call this
-// routine (or methods of this class) concurrently. In gflags,
-// most thread-hostile routines are intended to be called early in,
-// or even before, main() -- that is, before threads are spawned.
-// thread-compatible: it is safe for multiple threads to read from
-// this variable (when applied to variables), or to call const
-// methods of this class (when applied to classes), as long as no
-// other thread is writing to the variable or calling non-const
-// methods of this class.
-
-#ifndef GFLAGS_GFLAGS_H_
-#define GFLAGS_GFLAGS_H_
-
-#include <string>
-#include <vector>
-
-#include "gflags_declare.h" // IWYU pragma: export
-
-
-// We always want to export variables defined in user code
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DEFINE_FLAG
-# endif
-#endif
-
-
-namespace GFLAGS_NAMESPACE {
-
-
-// --------------------------------------------------------------------
-// To actually define a flag in a file, use DEFINE_bool,
-// DEFINE_string, etc. at the bottom of this file. You may also find
-// it useful to register a validator with the flag. This ensures that
-// when the flag is parsed from the commandline, or is later set via
-// SetCommandLineOption, we call the validation function. It is _not_
-// called when you assign the value to the flag directly using the = operator.
-//
-// The validation function should return true if the flag value is valid, and
-// false otherwise. If the function returns false for the new setting of the
-// flag, the flag will retain its current value. If it returns false for the
-// default value, ParseCommandLineFlags() will die.
-//
-// This function is safe to call at global construct time (as in the
-// example below).
-//
-// Example use:
-// static bool ValidatePort(const char* flagname, int32 value) {
-// if (value > 0 && value < 32768) // value is ok
-// return true;
-// printf("Invalid value for --%s: %d\n", flagname, (int)value);
-// return false;
-// }
-// DEFINE_int32(port, 0, "What port to listen on");
-// static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
-
-// Returns true if successfully registered, false if not (because the
-// first argument doesn't point to a command-line flag, or because a
-// validator is already registered for this flag).
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag, bool (*validate_fn)(const char*, bool));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag, bool (*validate_fn)(const char*, int32));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag, bool (*validate_fn)(const char*, int64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag, bool (*validate_fn)(const char*, uint64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag, bool (*validate_fn)(const char*, double));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
-
-// Convenience macro for the registration of a flag validator
-#define DEFINE_validator(name, validator) \
- static const bool name##_validator_registered = \
- GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
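-
-// Example use of the macro (illustrative sketch, reusing ValidatePort from
-// the example above):
-//   DEFINE_int32(port, 0, "What port to listen on");
-//   DEFINE_validator(port, &ValidatePort);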
-
-
-// --------------------------------------------------------------------
-// These methods are the best way to get access to info about the
-// list of commandline flags. Note that these routines are pretty slow.
-// GetAllFlags: mostly-complete info about the list, sorted by file.
-// ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
-// ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
-//
-// In addition to accessing flags, you can also access argv[0] (the program
-// name) and argv (the entire commandline), which we sock away a copy of.
-// These variables are static, so you should only set them once.
-//
-// No need to export this data-only structure from the DLL, avoiding VS warning 4251.
-struct CommandLineFlagInfo {
- std::string name; // the name of the flag
- std::string type; // the type of the flag: int32, etc
- std::string description; // the "help text" associated with the flag
- std::string current_value; // the current value, as a string
- std::string default_value; // the default value, as a string
- std::string filename; // 'cleaned' version of filename holding the flag
- bool has_validator_fn; // true if RegisterFlagValidator called on this flag
- bool is_default; // true if the flag has the default value and
- // has not been set explicitly from the cmdline
- // or via SetCommandLineOption
- const void* flag_ptr; // pointer to the flag's current value (i.e. FLAGS_foo)
-};
-
-// Using this inside of a validator is a recipe for a deadlock.
-// TODO(user) Fix locking when validators are running, to make it safe to
-// call validators during ParseAllFlags.
-// Also make sure then to uncomment the corresponding unit test in
-// gflags_unittest.sh
-extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
-// These two are actually defined in gflags_reporting.cc.
-extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0); // what --help does
-extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
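-
-// Example use of GetAllFlags (illustrative sketch, not from the original
-// header):
-//   std::vector<CommandLineFlagInfo> flags;
-//   GetAllFlags(&flags);
-//   for (size_t i = 0; i < flags.size(); ++i)
-//     printf("--%s (%s)\n", flags[i].name.c_str(), flags[i].type.c_str());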
-
-// Create a descriptive string for a flag.
-// Goes to some trouble to make pretty line breaks.
-extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
-
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
-
-// The following functions are thread-safe as long as SetArgv() is
-// only called before any threads start.
-extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
-extern GFLAGS_DLL_DECL const char* GetArgv(); // all of argv as a string
-extern GFLAGS_DLL_DECL const char* GetArgv0(); // only argv0
-extern GFLAGS_DLL_DECL uint32 GetArgvSum(); // simple checksum of argv
-extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
-extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName(); // basename(argv0)
-
-// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* ProgramUsage(); // string set by SetUsageMessage()
-
-// VersionString() is thread-safe as long as SetVersionString() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* VersionString(); // string set by SetVersionString()
-
-
-
-// --------------------------------------------------------------------
-// Normally you access commandline flags by just saying "if (FLAGS_foo)"
-// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
-// commonly, via the DEFINE_foo macro). But if you need a bit more
-// control, we have programmatic ways to get/set the flags as well.
-// These programmatic ways to access flags are thread-safe, but direct
-// access is only thread-compatible.
-
-// Return true iff the flagname was found.
-// OUTPUT is set to the flag's value, or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
-
-// Return true iff the flagname was found. OUTPUT is set to the flag's
-// CommandLineFlagInfo or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
-
-// Return the CommandLineFlagInfo of the flagname. exit() if name not found.
-// Example usage, to check if a flag's value is currently the default value:
-// if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
-extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
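-
-// Example use (illustrative sketch; "foo" is a hypothetical flag):
-//   std::string value;
-//   if (GetCommandLineOption("foo", &value)) {
-//     // value now holds the current value of --foo as a string.
-//   }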
-
-enum GFLAGS_DLL_DECL FlagSettingMode {
- // update the flag's value (can call this multiple times).
- SET_FLAGS_VALUE,
- // update the flag's value, but *only if* it has not yet been updated
- // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
- SET_FLAG_IF_DEFAULT,
- // set the flag's default value to this. If the flag has not yet been
- // updated (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef"),
- // change the flag's current value to the new default value as well.
- SET_FLAGS_DEFAULT
-};
-
-// Set a particular flag ("command line option"). Returns a string
-// describing the new value that the option has been set to. The
-// return value API is not well-specified, so basically just depend on
-// it to be empty if the setting failed for some reason -- the name is
-// not a valid flag name, or the value is not a valid value -- and
-// non-empty otherwise.
-
-// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
-extern GFLAGS_DLL_DECL std::string SetCommandLineOption (const char* name, const char* value);
-extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
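-
-// Example use (illustrative sketch; "foo" is a hypothetical flag):
-//   std::string result = SetCommandLineOption("foo", "true");
-//   if (result.empty()) {
-//     // Setting failed: no such flag, or "true" is not a valid value.
-//   }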
-
-
-// --------------------------------------------------------------------
-// Saves the states (value, default value, whether the user has set
-// the flag, registered validators, etc) of all flags, and restores
-// them when the FlagSaver is destroyed. This is very useful in
-// tests, say, when you want to let your tests change the flags, but
-// make sure that they get reverted to the original states when your
-// test is complete.
-//
-// Example usage:
-// void TestFoo() {
-// FlagSaver s1;
-// FLAG_foo = false;
-// FLAG_bar = "some value";
-//
-// // test happens here. You can return at any time
-// // without worrying about restoring the FLAG values.
-// }
-//
-// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
-// the work is done in the constructor and destructor, so in the standard
-// usage example above, the compiler would complain that it's an
-// unused variable.
-//
-// This class is thread-safe. However, its destructor writes to
-// exactly the set of flags that have changed value during its
-// lifetime, so concurrent _direct_ access to those flags
-// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
-
-class GFLAGS_DLL_DECL FlagSaver {
- public:
- FlagSaver();
- ~FlagSaver();
-
- private:
- class FlagSaverImpl* impl_; // we use pimpl here to keep API steady
-
- FlagSaver(const FlagSaver&); // no copying!
- void operator=(const FlagSaver&);
-}__attribute((unused));
-
-// --------------------------------------------------------------------
-// Some deprecated or hopefully-soon-to-be-deprecated functions.
-
-// This is often used for logging. TODO(csilvers): figure out a better way
-extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
-// Usually where this is used, a FlagSaver should be used instead.
-extern GFLAGS_DLL_DECL
-bool ReadFlagsFromString(const std::string& flagfilecontents,
- const char* prog_name,
- bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-// These let you manually implement --flagfile functionality.
-// DEPRECATED.
-extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
-extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-
-// --------------------------------------------------------------------
-// Useful routines for initializing flags from the environment.
-// In each case, if 'varname' does not exist in the environment
-// return defval. If 'varname' does exist but is not valid
-// (e.g., not a number for an int32 flag), abort with an error.
-// Otherwise, return the value. NOTE: for booleans, for true use
-// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
-
-extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
-extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
-extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
-extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
-extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
-extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
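-
-// Example use (illustrative sketch; the environment variable name is
-// hypothetical): let the environment supply a flag's default value.
-//   DEFINE_bool(verbose, BoolFromEnv("MYAPP_VERBOSE", false),
-//               "Enable verbose output");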
-
-
-// --------------------------------------------------------------------
-// The next two functions parse gflags from main():
-
-// Set the "usage" message for this program. For example:
-// string usage("This program does nothing. Sample usage:\n");
-// usage += argv[0] + " <uselessarg1> <uselessarg2>";
-// SetUsageMessage(usage);
-// Do not include commandline flags in the usage: we do that for you!
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
-
-// Sets the version string, which is emitted with --version.
-// For instance: SetVersionString("1.3");
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
-
-
-// Looks for flags in argv and parses them. Rearranges argv to put
-// flags first, or removes them entirely if remove_flags is true.
-// If a flag is defined more than once in the command line or flag
-// file, the last definition is used. Returns the index (into argv)
-// of the first non-flag argument.
-// See top-of-file for more details on this function.
-#ifndef SWIG // In swig, use ParseCommandLineFlagsScript() instead.
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
-#endif
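-
-// Typical use from main() (illustrative sketch; the usage and version
-// strings are examples):
-//   int main(int argc, char** argv) {
-//     GFLAGS_NAMESPACE::SetUsageMessage("Usage: see --help");
-//     GFLAGS_NAMESPACE::SetVersionString("1.0");
-//     GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
-//     // Only non-flag arguments remain in argv at this point.
-//   }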
-
-
-// Calls to ParseCommandLineNonHelpFlags and then to
-// HandleCommandLineHelpFlags can be used instead of a call to
-// ParseCommandLineFlags during initialization, in order to allow for
-// changing default values for some FLAGS (via
-// e.g. SetCommandLineOptionWithMode calls) between the time of
-// command line parsing and the time of dumping help information for
-// the flags as a result of command line parsing. If a flag is
-// defined more than once in the command line or flag file, the last
-// definition is used. Returns the index (into argv) of the first
-// non-flag argument. (If remove_flags is true, will always return 1.)
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
-
-// This is actually defined in gflags_reporting.cc.
-// This function is misnamed (it also handles --version, etc.), but
-// it's too late to change that now. :-(
-extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags(); // in gflags_reporting.cc
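-
-// Example of the two-phase sequence (illustrative sketch; the flag name and
-// value are hypothetical):
-//   ParseCommandLineNonHelpFlags(&argc, &argv, true);
-//   SetCommandLineOptionWithMode("port", "8080", SET_FLAGS_DEFAULT);
-//   HandleCommandLineHelpFlags();  // --help now reports the new default.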
-
-// Allow command line reparsing. Disables the error normally
-// generated when an unknown flag is found, since it may be found in a
-// later parse. Thread-hostile; meant to be called before any threads
-// are spawned.
-extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
-
-// Reparse the flags that have not yet been recognized. Only flags
-// registered since the last parse will be recognized. Any flag value
-// must be provided as part of the argument using "=", not as a
-// separate command line argument that follows the flag argument.
-// Intended for handling flags from dynamically loaded libraries,
-// since their flags are not registered until they are loaded.
-extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
-
-// Clean up memory allocated by flags. This is only needed to reduce
-// the quantity of "potentially leaked" reports emitted by memory
-// debugging tools such as valgrind. It is not required for normal
-// operation, or for the google perftools heap-checker. It must only
-// be called when the process is about to exit, and all threads that
-// might access flags are quiescent. Referencing flags after this is
-// called will have unexpected consequences. This is not safe to run
-// when multiple threads might be running: the function is
-// thread-hostile.
-extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
-
-
-// --------------------------------------------------------------------
-// Now come the command line flag declaration/definition macros that
-// will actually be used. They're kind of hairy. A major reason
-// for this is initialization: we want people to be able to access
-// variables in global constructors and have that not crash, even if
-// their global constructor runs before the global constructor here.
-// (Obviously, we can't guarantee the flags will have the correct
-// default value in that case, but at least accessing them is safe.)
-// The only way to do that is have flags point to a static buffer.
-// So we make one, using a union to ensure proper alignment, and
-// then use placement-new to actually set up the flag with the
-// correct default value. In the same vein, we have to worry about
-// flag access in global destructors, so FlagRegisterer has to be
-// careful never to destroy the flag-values it constructs.
-//
-// Note that when we define a flag variable FLAGS_<name>, we also
-// preemptively define a junk variable, FLAGS_no<name>. This is to
-// cause a link-time error if someone tries to define 2 flags with
-// names like "logging" and "nologging". We do this because a bool
-// flag FLAG can be set from the command line to true with a "-FLAG"
-// argument, and to false with a "-noFLAG" argument, and so this can
-// potentially avert confusion.
-//
-// We also put flags into their own namespace. It is purposefully
-// named in an opaque way that people should have trouble typing
-// directly. The idea is that DEFINE puts the flag in the weird
-// namespace, and DECLARE imports the flag from there into the current
-// namespace. The net result is to force people to use DECLARE to get
-// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
-// or some such instead. We want this so we can put extra
-// functionality (like sanity-checking) in DECLARE if we want, and
-// make sure it is picked up everywhere.
-//
-// We also put the type of the variable in the namespace, so that
-// people can't DECLARE_int32 something that they DEFINE_bool'd
-// elsewhere.
-
-class GFLAGS_DLL_DECL FlagRegisterer {
- public:
- FlagRegisterer(const char* name, const char* type,
- const char* help, const char* filename,
- void* current_storage, void* defvalue_storage);
-};
-
-// If your application #defines STRIP_FLAG_HELP to a non-zero value
-// before #including this file, we remove the help message from the
-// binary file. This can reduce the size of the resulting binary
-// somewhat, and may also be useful for security reasons.
-
-extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
-
-
-} // namespace GFLAGS_NAMESPACE
-
-
-#ifndef SWIG // In swig, ignore the main flag declarations
-
-#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
-// Need this construct to avoid the 'defined but not used' warning.
-#define MAYBE_STRIPPED_HELP(txt) \
- (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
-#else
-#define MAYBE_STRIPPED_HELP(txt) txt
-#endif
-
-// Each command-line flag has two variables associated with it: one
-// with the current value, and one with the default value. However,
-// we have a third variable, which is where value is assigned; it's a
-// constant. This guarantees that FLAG_##value is initialized at
-// static initialization time (e.g. before program-start) rather than
-// at global construction time (which is after program-start but
-// before main), at least when 'value' is a compile-time constant. We
-// use a small trick for the "default value" variable, and call it
-// FLAGS_no<name>. This serves the second purpose of assuring a
-// compile error if someone tries to define a flag named no<name>
-// which is illegal (--foo and --nofoo both affect the "foo" flag).
-#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
- namespace fL##shorttype { \
- static const type FLAGS_nono##name = value; \
- /* We always want to export defined variables, dll or no */ \
- GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \
- type FLAGS_no##name = FLAGS_nono##name; \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__, \
- &FLAGS_##name, &FLAGS_no##name); \
- } \
- using fL##shorttype::FLAGS_##name
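-
-// For example (illustrative; "timeout" is a hypothetical flag),
-// DEFINE_int32(timeout, 10, "help text") defines fLI::FLAGS_timeout (the
-// current value), fLI::FLAGS_notimeout (the saved default), and a
-// FlagRegisterer that records the flag; the trailing using-declaration
-// then makes FLAGS_timeout visible in the enclosing scope.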
-
-// For DEFINE_bool, we want to do the extra check that the passed-in
-// value is actually a bool, and not a string or something that can be
-// coerced to a bool. These declarations (no definition needed!) will
-// help us do that, and never evaluate From, which is important.
-// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
-// that the compiler have different sizes for bool & double. Since
-// this is not guaranteed by the standard, we check it with a
-// COMPILE_ASSERT.
-namespace fLB {
-struct CompileAssert {};
-typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
- (sizeof(double) != sizeof(bool)) ? 1 : -1];
-template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
-GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
-} // namespace fLB
-
-// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
-// are in a separate include, gflags_declare.h, for reducing
-// the physical transitive size for DECLARE use.
-#define DEFINE_bool(name, val, txt) \
- namespace fLB { \
- typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[ \
- (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
- } \
- DEFINE_VARIABLE(bool, B, name, val, txt)
-
-#define DEFINE_int32(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
- name, val, txt)
-
-#define DEFINE_int64(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
- name, val, txt)
-
-#define DEFINE_uint64(name,val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
- name, val, txt)
-
-#define DEFINE_double(name, val, txt) \
- DEFINE_VARIABLE(double, D, name, val, txt)
-
-// Strings are trickier, because they're not a POD, so we can't
-// construct them at static-initialization time (instead they get
-// constructed at global-constructor time, which is much later). To
-// try to avoid crashes in that case, we use a char buffer to store
-// the string, which we can static-initialize, and then placement-new
-// into it later. It's not perfect, but the best we can do.
-
-namespace fLS {
-
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const char *value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const clstring &value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- int value);
-} // namespace fLS
-
-// We need to define a var named FLAGS_no##name so people don't define
-// --string and --nostring. And we need a temporary place to put val
-// so we don't have to evaluate it twice. Two great needs that go
-// great together!
-// The weird 'using' + 'extern' inside the fLS namespace is to work around
-// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10. See
-// http://code.google.com/p/google-gflags/issues/detail?id=20
-#define DEFINE_string(name, val, txt) \
- namespace fLS { \
- using ::fLS::clstring; \
- static union { void* align; char s[sizeof(clstring)]; } s_##name[2]; \
- clstring* const FLAGS_no##name = ::fLS:: \
- dont_pass0toDEFINE_string(s_##name[0].s, \
- val); \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__, \
- s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name)); \
- extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name; \
- using fLS::FLAGS_##name; \
- clstring& FLAGS_##name = *FLAGS_no##name; \
- } \
- using fLS::FLAGS_##name
-
-#endif // SWIG
-
-
-// Import gflags library symbols into alternative/deprecated namespace(s)
-#include "gflags_gflags.h"
-
-
-#endif // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
deleted file mode 100644
index f951c1e0..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ---
-
-//
-// Implement helpful bash-style command line flag completions
-//
-// ** Functional API:
-// HandleCommandLineCompletions() should be called early during
-// program startup, but after command line flag code has been
-// initialized, such as the beginning of HandleCommandLineHelpFlags().
-// It checks the value of the flag --tab_completion_word. If this
-// flag is empty, nothing happens here. If it contains a string,
-// however, then HandleCommandLineCompletions() will hijack the
-// process, attempting to identify the intention behind this
-// completion. Regardless of the outcome of this deduction, the
-// process will be terminated, similar to --helpshort flag
-// handling.
-//
-// ** Overview of Bash completions:
-// Bash can be told to programmatically determine completions for the
-// current 'cursor word'. It does this by (in this case) invoking a
-// command with some additional arguments identifying the command
-// being executed, the word being completed, and the previous word
-// (if any). Bash then expects a sequence of output lines to be
-// printed to stdout. If these lines all contain a common prefix
-// longer than the cursor word, bash will replace the cursor word
-// with that common prefix, and display nothing. If there isn't such
-// a common prefix, bash will display the lines in pages using 'more'.
-//
-// ** Strategy taken for command line completions:
-// If we can deduce either the exact flag intended, or a common flag
-// prefix, we'll output exactly that. Otherwise, if information
-// must be displayed to the user, we'll take the opportunity to add
-// some helpful information beyond just the flag name (specifically,
-// we'll include the default flag value and as much of the flag's
-// description as can fit on a single terminal line width, as specified
-// by the flag --tab_completion_columns). Furthermore, we'll try to
-// make bash order the output such that the most useful or relevant
-// flags are the most likely to be shown at the top.
-//
-// ** Additional features:
-// To assist in finding that one really useful flag, substring matching
-// was implemented. Before pressing a <TAB> to get completion for the
-// current word, you can append one or more '?' to the flag to do
-// substring matching. Here's the semantics:
-// --foo<TAB> Show me all flags with names prefixed by 'foo'
-// --foo?<TAB> Show me all flags with 'foo' somewhere in the name
-// --foo??<TAB> Same as prior case, but also search in module
-// definition path for 'foo'
-// --foo???<TAB> Same as prior case, but also search in flag
-// descriptions for 'foo'
-// Finally, we'll trim the output to a relatively small number of
-// flags to keep bash quiet about the verbosity of output. If one
-// really wanted to see all possible matches, appending a '+' to the
-// search word will force the exhaustive list of matches to be printed.
-//
-// ** How to have bash accept completions from a binary:
-// Bash requires that it be informed about each command that programmatic
-// completion should be enabled for. Example addition to a .bashrc
-// file would be (your path to gflags_completions.sh file may differ):
-
-/*
-$ complete -o bashdefault -o default -o nospace -C \
- '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
- time env binary_name another_binary [...]
-*/
-
-// This would allow the following to work:
-// $ /path/to/binary_name --vmodule<TAB>
-// Or:
-// $ ./bin/path/another_binary --gfs_u<TAB>
-// (etc)
-//
-// Sadly, it appears that bash gives no easy way to force this behavior for
-// all commands. That's where the "time" in the above example comes in.
-// If you haven't specifically added a command to the list of completion
-// supported commands, you can still get completions by prefixing the
-// entire command with "env".
-// $ env /some/brand/new/binary --vmod<TAB>
-// Assuming that "binary" is a newly compiled binary, this should still
-// produce the expected completion output.
-
-
-#ifndef GFLAGS_COMPLETIONS_H_
-#define GFLAGS_COMPLETIONS_H_
-
-namespace google {
-
-extern void HandleCommandLineCompletions(void);
-
-}
-
-#endif // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
deleted file mode 100644
index 935a20e7..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) 1999, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-//
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// a command line flag.
-
-#ifndef GFLAGS_DECLARE_H_
-#define GFLAGS_DECLARE_H_
-
-
-// ---------------------------------------------------------------------------
-// Namespace of gflags library symbols.
-#define GFLAGS_NAMESPACE google
-
-// ---------------------------------------------------------------------------
-// Windows DLL import/export.
-
-// We always want to import the symbols of the gflags library
-#ifndef GFLAGS_DLL_DECL
-# if 0 && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-
-// We always want to import variables declared in user code
-#ifndef GFLAGS_DLL_DECLARE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECLARE_FLAG
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Flag types
-#include <string>
-#if 1
-# include <stdint.h> // the normal place uint32_t is defined
-#elif 1
-# include <sys/types.h> // the normal place u_int32_t is defined
-#elif 1
-# include <inttypes.h> // a third place for uint32_t or u_int32_t
-#endif
-
-namespace GFLAGS_NAMESPACE {
-
-#if 1 // C99
-typedef int32_t int32;
-typedef uint32_t uint32;
-typedef int64_t int64;
-typedef uint64_t uint64;
-#elif 0 // BSD
-typedef int32_t int32;
-typedef u_int32_t uint32;
-typedef int64_t int64;
-typedef u_int64_t uint64;
-#elif 0 // Windows
-typedef __int32 int32;
-typedef unsigned __int32 uint32;
-typedef __int64 int64;
-typedef unsigned __int64 uint64;
-#else
-# error Do not know how to define a 32-bit integer quantity on your system
-#endif
-
-} // namespace GFLAGS_NAMESPACE
-
-
-namespace fLS {
-
-// The meaning of "string" might be different between now and when the
-// macros below get invoked (e.g., if someone is experimenting with
-// other string implementations that get defined after this file is
-// included). Save the current meaning now and use it in the macros.
-typedef std::string clstring;
-
-} // namespace fLS
-
-
-#define DECLARE_VARIABLE(type, shorttype, name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
- using fL##shorttype::FLAGS_##name
-
-#define DECLARE_bool(name) \
- DECLARE_VARIABLE(bool, B, name)
-
-#define DECLARE_int32(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
-
-#define DECLARE_int64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
-
-#define DECLARE_uint64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
-
-#define DECLARE_double(name) \
- DECLARE_VARIABLE(double, D, name)
-
-#define DECLARE_string(name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fLS { \
- using ::fLS::clstring; \
- extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
- } \
- using fLS::FLAGS_##name
-
-
-#endif // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
deleted file mode 100644
index 0c17825d..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2014, Andreas Schuh
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// -----------------------------------------------------------------------------
-// Imports the gflags library symbols into an alternative/deprecated namespace.
-
-#ifndef GFLAGS_GFLAGS_H_
-# error The internal header gflags_gflags.h may only be included by gflags.h
-#endif
-
-#ifndef GFLAGS_NS_GFLAGS_H_
-#define GFLAGS_NS_GFLAGS_H_
-
-
-namespace gflags {
-
-
-using GFLAGS_NAMESPACE::int32;
-using GFLAGS_NAMESPACE::uint32;
-using GFLAGS_NAMESPACE::int64;
-using GFLAGS_NAMESPACE::uint64;
-
-using GFLAGS_NAMESPACE::RegisterFlagValidator;
-using GFLAGS_NAMESPACE::CommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetAllFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
-using GFLAGS_NAMESPACE::DescribeOneFlag;
-using GFLAGS_NAMESPACE::SetArgv;
-using GFLAGS_NAMESPACE::GetArgvs;
-using GFLAGS_NAMESPACE::GetArgv;
-using GFLAGS_NAMESPACE::GetArgv0;
-using GFLAGS_NAMESPACE::GetArgvSum;
-using GFLAGS_NAMESPACE::ProgramInvocationName;
-using GFLAGS_NAMESPACE::ProgramInvocationShortName;
-using GFLAGS_NAMESPACE::ProgramUsage;
-using GFLAGS_NAMESPACE::VersionString;
-using GFLAGS_NAMESPACE::GetCommandLineOption;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
-using GFLAGS_NAMESPACE::FlagSettingMode;
-using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
-using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
-using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
-using GFLAGS_NAMESPACE::SetCommandLineOption;
-using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
-using GFLAGS_NAMESPACE::FlagSaver;
-using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
-using GFLAGS_NAMESPACE::ReadFlagsFromString;
-using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
-using GFLAGS_NAMESPACE::ReadFromFlagsFile;
-using GFLAGS_NAMESPACE::BoolFromEnv;
-using GFLAGS_NAMESPACE::Int32FromEnv;
-using GFLAGS_NAMESPACE::Int64FromEnv;
-using GFLAGS_NAMESPACE::Uint64FromEnv;
-using GFLAGS_NAMESPACE::DoubleFromEnv;
-using GFLAGS_NAMESPACE::StringFromEnv;
-using GFLAGS_NAMESPACE::SetUsageMessage;
-using GFLAGS_NAMESPACE::SetVersionString;
-using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
-using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
-using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
-using GFLAGS_NAMESPACE::FlagRegisterer;
-
-#ifndef SWIG
-using GFLAGS_NAMESPACE::ParseCommandLineFlags;
-#endif
-
-
-} // namespace gflags
-
-
-#endif // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/private/config.h b/files/third_party/gflags/gen/posix/include/private/config.h
deleted file mode 100644
index 592d61c4..00000000
--- a/files/third_party/gflags/gen/posix/include/private/config.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Generated from config.h.in during build configuration using CMake. */
-
-// Note: This header file is only used internally. It is not part of the public interface!
-
-// ---------------------------------------------------------------------------
-// System checks
-
-// Define if you build this library for a MS Windows OS.
-/* #undef OS_WINDOWS */
-
-// Define if you have the <stdint.h> header file.
-#define HAVE_STDINT_H
-
-// Define if you have the <sys/types.h> header file.
-#define HAVE_SYS_TYPES_H
-
-// Define if you have the <inttypes.h> header file.
-#define HAVE_INTTYPES_H
-
-// Define if you have the <sys/stat.h> header file.
-#define HAVE_SYS_STAT_H
-
-// Define if you have the <unistd.h> header file.
-#define HAVE_UNISTD_H
-
-// Define if you have the <fnmatch.h> header file.
-#define HAVE_FNMATCH_H
-
-// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
-/* #undef HAVE_SHLWAPI_H */
-
-// Define if you have the strtoll function.
-#define HAVE_STRTOLL
-
-// Define if you have the strtoq function.
-/* #undef HAVE_STRTOQ */
-
-// Define if you have the <pthread.h> header file.
-#define HAVE_PTHREAD
-
-// Define if your pthread library defines the type pthread_rwlock_t
-#define HAVE_RWLOCK
-
-// gcc requires this to get PRId64, etc.
-#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
-# define __STDC_FORMAT_MACROS 1
-#endif
-
-// ---------------------------------------------------------------------------
-// Package information
-
-// Name of package.
-#define PACKAGE gflags
-
-// Define to the full name of this package.
-#define PACKAGE_NAME gflags
-
-// Define to the full name and version of this package.
-#define PACKAGE_STRING gflags 2.2.0
-
-// Define to the one symbol short name of this package.
-#define PACKAGE_TARNAME gflags-2.2.0
-
-// Define to the version of this package.
-#define PACKAGE_VERSION 2.2.0
-
-// Version number of package.
-#define VERSION PACKAGE_VERSION
-
-// Define to the address where bug reports for this package should be sent.
-#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
-
-// ---------------------------------------------------------------------------
-// Path separator
-#ifndef PATH_SEPARATOR
-# ifdef OS_WINDOWS
-# define PATH_SEPARATOR '\\'
-# else
-# define PATH_SEPARATOR '/'
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Windows
-
-// Whether gflags library is a DLL.
-#ifndef GFLAGS_IS_A_DLL
-# define GFLAGS_IS_A_DLL 0
-#endif
-
-// Always export symbols when compiling a shared library as this file is only
-// included by internal modules when building the gflags library itself.
-// The gflags_declare.h header file will set it to import these symbols otherwise.
-#ifndef GFLAGS_DLL_DECL
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-// Flags defined by the gflags library itself must be exported
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
-#endif
-
-#ifdef OS_WINDOWS
-// The unittests import the symbols of the shared gflags library
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
-# endif
-# include "windows_port.h"
-#endif
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags.h
deleted file mode 100644
index 357eec6b..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags.h
+++ /dev/null
@@ -1,573 +0,0 @@
-// Copyright (c) 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// or defines a command line flag or wants to parse command line flags
-// or print a program usage message (which will include information about
-// flags). Executive summary, in the form of an example foo.cc file:
-//
-// #include "foo.h" // foo.h has a line "DECLARE_int32(start);"
-// #include "validators.h" // hypothetical file defining ValidateIsFile()
-//
-// DEFINE_int32(end, 1000, "The last record to read");
-//
-// DEFINE_string(filename, "my_file.txt", "The file to read");
-// // Crash if the specified file does not exist.
-// static bool dummy = RegisterFlagValidator(&FLAGS_filename,
-// &ValidateIsFile);
-//
-// DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
-//
-// void MyFunc() {
-// if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
-// }
-//
-// Then, at the command-line:
-// ./foo --noverbose --start=5 --end=100
-//
-// For more details, see
-// doc/gflags.html
-//
-// --- A note about thread-safety:
-//
-// We describe many functions in this file as being thread-hostile,
-// thread-compatible, or thread-safe. Here are the meanings we use:
-//
-// thread-safe: it is safe for multiple threads to call this routine
-// (or, when referring to a class, methods of this class)
-// concurrently.
-// thread-hostile: it is not safe for multiple threads to call this
-// routine (or methods of this class) concurrently. In gflags,
-// most thread-hostile routines are intended to be called early in,
-// or even before, main() -- that is, before threads are spawned.
-// thread-compatible: it is safe for multiple threads to read from
-// this variable (when applied to variables), or to call const
-// methods of this class (when applied to classes), as long as no
-// other thread is writing to the variable or calling non-const
-// methods of this class.
-
-#ifndef GFLAGS_GFLAGS_H_
-#define GFLAGS_GFLAGS_H_
-
-#include <string>
-#include <vector>
-
-#include "gflags_declare.h" // IWYU pragma: export
-
-
-// We always want to export variables defined in user code
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DEFINE_FLAG
-# endif
-#endif
-
-
-namespace GFLAGS_NAMESPACE {
-
-
-// --------------------------------------------------------------------
-// To actually define a flag in a file, use DEFINE_bool,
-// DEFINE_string, etc. at the bottom of this file. You may also find
-// it useful to register a validator with the flag. This ensures that
-// when the flag is parsed from the commandline, or is later set via
-// SetCommandLineOption, we call the validation function. It is _not_
-// called when you assign the value to the flag directly using the = operator.
-//
-// The validation function should return true if the flag value is valid, and
-// false otherwise. If the function returns false for the new setting of the
-// flag, the flag will retain its current value. If it returns false for the
-// default value, ParseCommandLineFlags() will die.
-//
-// This function is safe to call at global construct time (as in the
-// example below).
-//
-// Example use:
-// static bool ValidatePort(const char* flagname, int32 value) {
-// if (value > 0 && value < 32768) // value is ok
-// return true;
-// printf("Invalid value for --%s: %d\n", flagname, (int)value);
-// return false;
-// }
-// DEFINE_int32(port, 0, "What port to listen on");
-// static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
-
-// Returns true if successfully registered, false if not (because the
-// first argument doesn't point to a command-line flag, or because a
-// validator is already registered for this flag).
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag, bool (*validate_fn)(const char*, bool));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag, bool (*validate_fn)(const char*, int32));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag, bool (*validate_fn)(const char*, int64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag, bool (*validate_fn)(const char*, uint64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag, bool (*validate_fn)(const char*, double));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
-
-// Convenience macro for the registration of a flag validator
-#define DEFINE_validator(name, validator) \
- static const bool name##_validator_registered = \
- GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
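-
-// For example, the registration from the validator example above can be
-// written more compactly with this macro (a sketch, reusing ValidatePort):
-//    DEFINE_int32(port, 0, "What port to listen on");
-//    DEFINE_validator(port, &ValidatePort);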
-
-
-// --------------------------------------------------------------------
-// These methods are the best way to get access to info about the
-// list of commandline flags. Note that these routines are pretty slow.
-// GetAllFlags: mostly-complete info about the list, sorted by file.
-// ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
-// ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
-//
-// In addition to accessing flags, you can also access argv[0] (the program
-// name) and argv (the entire commandline), which we sock away a copy of.
-// These variables are static, so you should only set them once.
-//
-// No need to export this data-only structure from the DLL; this avoids VS warning 4251.
-struct CommandLineFlagInfo {
- std::string name; // the name of the flag
- std::string type; // the type of the flag: int32, etc
- std::string description; // the "help text" associated with the flag
- std::string current_value; // the current value, as a string
- std::string default_value; // the default value, as a string
- std::string filename; // 'cleaned' version of filename holding the flag
- bool has_validator_fn; // true if RegisterFlagValidator called on this flag
- bool is_default; // true if the flag has the default value and
- // has not been set explicitly from the cmdline
- // or via SetCommandLineOption
- const void* flag_ptr; // pointer to the flag's current value (i.e. FLAGS_foo)
-};
-
-// Using this inside of a validator is a recipe for a deadlock.
-// TODO(user) Fix locking when validators are running, to make it safe to
-// call validators during ParseAllFlags.
-// Also make sure then to uncomment the corresponding unit test in
-// gflags_unittest.sh
-extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
-// These two are actually defined in gflags_reporting.cc.
-extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0); // what --help does
-extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
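-
-// Example use of GetAllFlags (a sketch):
-//    std::vector<CommandLineFlagInfo> flags;
-//    GetAllFlags(&flags);
-//    for (size_t i = 0; i < flags.size(); ++i)
-//      printf("--%s (%s): %s\n", flags[i].name.c_str(),
-//             flags[i].type.c_str(), flags[i].description.c_str());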
-
-// Create a descriptive string for a flag.
-// Goes to some trouble to make pretty line breaks.
-extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
-
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
-
-// The following functions are thread-safe as long as SetArgv() is
-// only called before any threads start.
-extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
-extern GFLAGS_DLL_DECL const char* GetArgv(); // all of argv as a string
-extern GFLAGS_DLL_DECL const char* GetArgv0(); // only argv0
-extern GFLAGS_DLL_DECL uint32 GetArgvSum(); // simple checksum of argv
-extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
-extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName(); // basename(argv0)
-
-// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* ProgramUsage(); // string set by SetUsageMessage()
-
-// VersionString() is thread-safe as long as SetVersionString() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* VersionString(); // string set by SetVersionString()
-
-
-
-// --------------------------------------------------------------------
-// Normally you access commandline flags by just saying "if (FLAGS_foo)"
-// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
-// commonly, via the DEFINE_foo macro). But if you need a bit more
-// control, we have programmatic ways to get/set the flags as well.
-// These programmatic ways to access flags are thread-safe, but direct
-// access is only thread-compatible.
-
-// Return true iff the flagname was found.
-// OUTPUT is set to the flag's value, or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
-
-// Return true iff the flagname was found. OUTPUT is set to the flag's
-// CommandLineFlagInfo or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
-
-// Return the CommandLineFlagInfo of the flagname. exit() if name not found.
-// Example usage, to check if a flag's value is currently the default value:
-// if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
-extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
-
-enum GFLAGS_DLL_DECL FlagSettingMode {
- // update the flag's value (can call this multiple times).
- SET_FLAGS_VALUE,
- // update the flag's value, but *only if* it has not yet been updated
- // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
- SET_FLAG_IF_DEFAULT,
-  // set the flag's default value to this. If the flag has not yet been
-  // updated (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef"),
-  // change the flag's current value to the new default value as well.
- SET_FLAGS_DEFAULT
-};
-
-// Set a particular flag ("command line option"). Returns a string
-// describing the new value that the option has been set to. The
-// return value API is not well-specified, so basically just depend on
-// it to be empty if the setting failed for some reason -- the name is
-// not a valid flag name, or the value is not a valid value -- and
-// non-empty otherwise.
-
-// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
-extern GFLAGS_DLL_DECL std::string SetCommandLineOption (const char* name, const char* value);
-extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
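-
-// Example use (a sketch; "foo" stands for any defined flag):
-//    std::string result = SetCommandLineOption("foo", "42");
-//    if (result.empty()) { /* the name or the value was not valid */ }
-//    SetCommandLineOptionWithMode("foo", "43", SET_FLAG_IF_DEFAULT);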
-
-
-// --------------------------------------------------------------------
-// Saves the states (value, default value, whether the user has set
-// the flag, registered validators, etc) of all flags, and restores
-// them when the FlagSaver is destroyed. This is very useful in
-// tests, say, when you want to let your tests change the flags, but
-// make sure that they get reverted to the original states when your
-// test is complete.
-//
-// Example usage:
-// void TestFoo() {
-// FlagSaver s1;
-// FLAG_foo = false;
-// FLAG_bar = "some value";
-//
-// // test happens here. You can return at any time
-// // without worrying about restoring the FLAG values.
-// }
-//
-// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
-// the work is done in the constructor and destructor, so in the standard
-// usage example above, the compiler would complain that it's an
-// unused variable.
-//
-// This class is thread-safe. However, its destructor writes to
-// exactly the set of flags that have changed value during its
-// lifetime, so concurrent _direct_ access to those flags
-// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
-
-class GFLAGS_DLL_DECL FlagSaver {
- public:
- FlagSaver();
- ~FlagSaver();
-
- private:
- class FlagSaverImpl* impl_; // we use pimpl here to keep API steady
-
- FlagSaver(const FlagSaver&); // no copying!
- void operator=(const FlagSaver&);
-};
-
-// --------------------------------------------------------------------
-// Some deprecated or hopefully-soon-to-be-deprecated functions.
-
-// This is often used for logging. TODO(csilvers): figure out a better way
-extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
-// Usually where this is used, a FlagSaver should be used instead.
-extern GFLAGS_DLL_DECL
-bool ReadFlagsFromString(const std::string& flagfilecontents,
- const char* prog_name,
- bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-// These let you manually implement --flagfile functionality.
-// DEPRECATED.
-extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
-extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-
-// --------------------------------------------------------------------
-// Useful routines for initializing flags from the environment.
-// In each case, if 'varname' does not exist in the environment
-// return defval. If 'varname' does exist but is not valid
-// (e.g., not a number for an int32 flag), abort with an error.
-// Otherwise, return the value. NOTE: for booleans, for true use
-// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
-
-extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
-extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
-extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
-extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
-extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
-extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
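-
-// Example use (a sketch; MYAPP_PORT is a hypothetical environment variable):
-//    DEFINE_int32(port, Int32FromEnv("MYAPP_PORT", 8080),
-//                 "What port to listen on");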
-
-
-// --------------------------------------------------------------------
-// The next two functions parse gflags from main():
-
-// Set the "usage" message for this program. For example:
-// string usage("This program does nothing. Sample usage:\n");
-// usage += argv[0] + " <uselessarg1> <uselessarg2>";
-// SetUsageMessage(usage);
-// Do not include commandline flags in the usage: we do that for you!
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
-
-// Sets the version string, which is emitted with --version.
-// For instance: SetVersionString("1.3");
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
-
-
-// Looks for flags in argv and parses them. Rearranges argv to put
-// flags first, or removes them entirely if remove_flags is true.
-// If a flag is defined more than once in the command line or flag
-// file, the last definition is used. Returns the index (into argv)
-// of the first non-flag argument.
-// See top-of-file for more details on this function.
-#ifndef SWIG // In swig, use ParseCommandLineFlagsScript() instead.
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
-#endif
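-
-// Example use (a sketch of a typical main()):
-//    int main(int argc, char** argv) {
-//      SetUsageMessage("Sample usage: " + std::string(argv[0]) + " <args>");
-//      SetVersionString("0.1");
-//      uint32 first_arg = ParseCommandLineFlags(&argc, &argv, true);
-//      // argv[first_arg] is the first non-flag command-line argument.
-//    }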
-
-
-// Calls to ParseCommandLineNonHelpFlags and then to
-// HandleCommandLineHelpFlags can be used instead of a call to
-// ParseCommandLineFlags during initialization, in order to allow for
-// changing default values for some FLAGS (via
-// e.g. SetCommandLineOptionWithMode calls) between the time of
-// command line parsing and the time of dumping help information for
-// the flags as a result of command line parsing. If a flag is
-// defined more than once in the command line or flag file, the last
-// definition is used. Returns the index (into argv) of the first
-// non-flag argument. (If remove_flags is true, will always return 1.)
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
-
-// This is actually defined in gflags_reporting.cc.
-// This function is misnamed (it also handles --version, etc.), but
-// it's too late to change that now. :-(
-extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags(); // in gflags_reporting.cc
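-
-// Example of the two-phase form described above (a sketch; "threads" is a
-// hypothetical flag whose default is adjusted between the two calls):
-//    ParseCommandLineNonHelpFlags(&argc, &argv, true);
-//    SetCommandLineOptionWithMode("threads", "8", SET_FLAGS_DEFAULT);
-//    HandleCommandLineHelpFlags();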
-
-// Allow command line reparsing. Disables the error normally
-// generated when an unknown flag is found, since it may be found in a
-// later parse. Thread-hostile; meant to be called before any threads
-// are spawned.
-extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
-
-// Reparse the flags that have not yet been recognized. Only flags
-// registered since the last parse will be recognized. Any flag value
-// must be provided as part of the argument using "=", not as a
-// separate command line argument that follows the flag argument.
-// Intended for handling flags from dynamically loaded libraries,
-// since their flags are not registered until they are loaded.
-extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
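-
-// Example use (a sketch; "mylib.so" is a hypothetical plugin that defines
-// its own flags):
-//    AllowCommandLineReparsing();
-//    ParseCommandLineFlags(&argc, &argv, false);
-//    dlopen("mylib.so", RTLD_NOW);  // registers the plugin's flags
-//    ReparseCommandLineNonHelpFlags();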
-
-// Clean up memory allocated by flags. This is only needed to reduce
-// the quantity of "potentially leaked" reports emitted by memory
-// debugging tools such as valgrind. It is not required for normal
-// operation, or for the google perftools heap-checker. It must only
-// be called when the process is about to exit, and all threads that
-// might access flags are quiescent. Referencing flags after this is
-// called will have unexpected consequences. This is not safe to run
-// when multiple threads might be running: the function is
-// thread-hostile.
-extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
-
-
-// --------------------------------------------------------------------
-// Now come the command line flag declaration/definition macros that
-// will actually be used. They're kind of hairy. A major reason
-// for this is initialization: we want people to be able to access
-// variables in global constructors and have that not crash, even if
-// their global constructor runs before the global constructor here.
-// (Obviously, we can't guarantee the flags will have the correct
-// default value in that case, but at least accessing them is safe.)
-// The only way to do that is have flags point to a static buffer.
-// So we make one, using a union to ensure proper alignment, and
-// then use placement-new to actually set up the flag with the
-// correct default value. In the same vein, we have to worry about
-// flag access in global destructors, so FlagRegisterer has to be
-// careful never to destroy the flag-values it constructs.
-//
-// Note that when we define a flag variable FLAGS_<name>, we also
-// preemptively define a junk variable, FLAGS_no<name>. This is to
-// cause a link-time error if someone tries to define 2 flags with
-// names like "logging" and "nologging". We do this because a bool
-// flag FLAG can be set from the command line to true with a "-FLAG"
-// argument, and to false with a "-noFLAG" argument, and so this can
-// potentially avert confusion.
-//
-// We also put flags into their own namespace. It is purposefully
-// named in an opaque way that people should have trouble typing
-// directly. The idea is that DEFINE puts the flag in the weird
-// namespace, and DECLARE imports the flag from there into the current
-// namespace. The net result is to force people to use DECLARE to get
-// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
-// or some such instead. We want this so we can put extra
-// functionality (like sanity-checking) in DECLARE if we want, and
-// make sure it is picked up everywhere.
-//
-// We also put the type of the variable in the namespace, so that
-// people can't DECLARE_int32 something that they DEFINE_bool'd
-// elsewhere.
-
-class GFLAGS_DLL_DECL FlagRegisterer {
- public:
- FlagRegisterer(const char* name, const char* type,
- const char* help, const char* filename,
- void* current_storage, void* defvalue_storage);
-};
-
-// If your application #defines STRIP_FLAG_HELP to a non-zero value
-// before #including this file, we remove the help message from the
-// binary file. This can reduce the size of the resulting binary
-// somewhat, and may also be useful for security reasons.
-
-extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
-
-
-} // namespace GFLAGS_NAMESPACE
-
-
-#ifndef SWIG // In swig, ignore the main flag declarations
-
-#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
-// Need this construct to avoid the 'defined but not used' warning.
-#define MAYBE_STRIPPED_HELP(txt) \
- (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
-#else
-#define MAYBE_STRIPPED_HELP(txt) txt
-#endif
-
-// Each command-line flag has two variables associated with it: one
-// with the current value, and one with the default value. However,
-// we have a third variable, which is where value is assigned; it's a
-// constant. This guarantees that FLAG_##value is initialized at
-// static initialization time (e.g. before program-start) rather than
-// global construction time (which is after program-start but
-// before main), at least when 'value' is a compile-time constant. We
-// use a small trick for the "default value" variable, and call it
-// FLAGS_no<name>. This serves the second purpose of assuring a
-// compile error if someone tries to define a flag named no<name>
-// which is illegal (--foo and --nofoo both affect the "foo" flag).
-#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
- namespace fL##shorttype { \
- static const type FLAGS_nono##name = value; \
- /* We always want to export defined variables, dll or no */ \
- GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \
- type FLAGS_no##name = FLAGS_nono##name; \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__, \
- &FLAGS_##name, &FLAGS_no##name); \
- } \
- using fL##shorttype::FLAGS_##name
-
-// For DEFINE_bool, we want to do the extra check that the passed-in
-// value is actually a bool, and not a string or something that can be
-// coerced to a bool. These declarations (no definition needed!) will
-// help us do that, and never evaluate From, which is important.
-// We'll use 'sizeof(IsBoolFlag(val))' to distinguish. This code requires
-// that the compiler have different sizes for bool & double. Since
-// this is not guaranteed by the standard, we check it with a
-// COMPILE_ASSERT.
-namespace fLB {
-struct CompileAssert {};
-typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
- (sizeof(double) != sizeof(bool)) ? 1 : -1];
-template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
-GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
-} // namespace fLB
-
-// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
-// are in a separate include, gflags_declare.h, to reduce the transitive
-// include size for files that only DECLARE flags.
-#define DEFINE_bool(name, val, txt) \
- namespace fLB { \
- typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[ \
- (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
- } \
- DEFINE_VARIABLE(bool, B, name, val, txt)
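-
-// For instance (a sketch of the compile-time check above):
-//    DEFINE_bool(enable_foo, true, "ok");      // compiles: val is a bool
-//    DEFINE_bool(enable_bar, "true", "bad");   // compile error: IsBoolFlag
-//                                              // returns double here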
-
-#define DEFINE_int32(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
- name, val, txt)
-
-#define DEFINE_int64(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
- name, val, txt)
-
-#define DEFINE_uint64(name,val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
- name, val, txt)
-
-#define DEFINE_double(name, val, txt) \
- DEFINE_VARIABLE(double, D, name, val, txt)
-
-// Strings are trickier, because they're not a POD, so we can't
-// construct them at static-initialization time (instead they get
-// constructed at global-constructor time, which is much later). To
-// try to avoid crashes in that case, we use a char buffer to store
-// the string, which we can static-initialize, and then placement-new
-// into it later. It's not perfect, but the best we can do.
-
-namespace fLS {
-
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const char *value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const clstring &value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- int value);
-} // namespace fLS
-
-// We need to define a var named FLAGS_no##name so people don't define
-// --string and --nostring. And we need a temporary place to put val
-// so we don't have to evaluate it twice. Two great needs that go
-// great together!
-// The weird 'using' + 'extern' inside the fLS namespace is to work around
-// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10. See
-// http://code.google.com/p/google-gflags/issues/detail?id=20
-#define DEFINE_string(name, val, txt) \
- namespace fLS { \
- using ::fLS::clstring; \
- static union { void* align; char s[sizeof(clstring)]; } s_##name[2]; \
- clstring* const FLAGS_no##name = ::fLS:: \
- dont_pass0toDEFINE_string(s_##name[0].s, \
- val); \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__, \
- s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name)); \
- extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name; \
- using fLS::FLAGS_##name; \
- clstring& FLAGS_##name = *FLAGS_no##name; \
- } \
- using fLS::FLAGS_##name
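-
-// Example use (a sketch; the semantics match the other DEFINE_* macros):
-//    DEFINE_string(filename, "my_file.txt", "The file to read");
-//    ... FLAGS_filename then behaves as a std::string, placement-constructed
-//    in the static buffer above.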
-
-#endif // SWIG
-
-
-// Import gflags library symbols into alternative/deprecated namespace(s)
-#include "gflags_gflags.h"
-
-
-#endif // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
deleted file mode 100644
index f951c1e0..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ---
-
-//
-// Implement helpful bash-style command line flag completions
-//
-// ** Functional API:
-// HandleCommandLineCompletions() should be called early during
-// program startup, but after command line flag code has been
-// initialized, such as the beginning of HandleCommandLineHelpFlags().
-// It checks the value of the flag --tab_completion_word. If this
-// flag is empty, nothing happens here. If it contains a string,
-// however, then HandleCommandLineCompletions() will hijack the
-// process, attempting to identify the intention behind this
-// completion. Regardless of the outcome of this deduction, the
-// process will be terminated, similar to --helpshort flag
-// handling.
-//
-// ** Overview of Bash completions:
-// Bash can be told to programmatically determine completions for the
-// current 'cursor word'. It does this by (in this case) invoking a
-// command with some additional arguments identifying the command
-// being executed, the word being completed, and the previous word
-// (if any). Bash then expects a sequence of output lines to be
-// printed to stdout. If these lines all contain a common prefix
-// longer than the cursor word, bash will replace the cursor word
-// with that common prefix, and display nothing. If there isn't such
-// a common prefix, bash will display the lines in pages using 'more'.
-//
-// ** Strategy taken for command line completions:
-// If we can deduce either the exact flag intended, or a common flag
-// prefix, we'll output exactly that. Otherwise, if information
-// must be displayed to the user, we'll take the opportunity to add
-// some helpful information beyond just the flag name (specifically,
-// we'll include the default flag value and as much of the flag's
-// description as can fit on a single terminal line width, as specified
-// by the flag --tab_completion_columns). Furthermore, we'll try to
-// make bash order the output such that the most useful or relevant
-// flags are the most likely to be shown at the top.
-//
-// ** Additional features:
-// To assist in finding that one really useful flag, substring matching
-// was implemented. Before pressing a <TAB> to get completion for the
-// current word, you can append one or more '?' to the flag to do
-// substring matching. Here's the semantics:
-// --foo<TAB> Show me all flags with names prefixed by 'foo'
-// --foo?<TAB> Show me all flags with 'foo' somewhere in the name
-// --foo??<TAB> Same as prior case, but also search in module
-// definition path for 'foo'
-// --foo???<TAB> Same as prior case, but also search in flag
-// descriptions for 'foo'
-// Finally, we'll trim the output to a relatively small number of
-// flags to keep bash quiet about the verbosity of output. If one
-// really wanted to see all possible matches, appending a '+' to the
-// search word will force the exhaustive list of matches to be printed.
-//
-// ** How to have bash accept completions from a binary:
-// Bash requires that it be informed about each command that programmatic
-// completion should be enabled for. An example addition to a .bashrc
-// file would be (your path to the gflags_completions.sh file may differ):
-
-/*
-$ complete -o bashdefault -o default -o nospace -C \
- '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
- time env binary_name another_binary [...]
-*/
-
-// This would allow the following to work:
-// $ /path/to/binary_name --vmodule<TAB>
-// Or:
-// $ ./bin/path/another_binary --gfs_u<TAB>
-// (etc)
-//
-// Sadly, it appears that bash gives no easy way to force this behavior for
-// all commands. That's where the "time" in the above example comes in.
-// If you haven't specifically added a command to the list of completion
-// supported commands, you can still get completions by prefixing the
-// entire command with "env".
-// $ env /some/brand/new/binary --vmod<TAB>
-// Assuming that "binary" is a newly compiled binary, this should still
-// produce the expected completion output.
-
-
-#ifndef GFLAGS_COMPLETIONS_H_
-#define GFLAGS_COMPLETIONS_H_
-
-namespace google {
-
-extern void HandleCommandLineCompletions(void);
-
-}
-
-#endif // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
deleted file mode 100644
index fbc8466f..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) 1999, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-//
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// a command line flag.
-
-#ifndef GFLAGS_DECLARE_H_
-#define GFLAGS_DECLARE_H_
-
-
-// ---------------------------------------------------------------------------
-// Namespace of gflags library symbols.
-#define GFLAGS_NAMESPACE google
-
-// ---------------------------------------------------------------------------
-// Windows DLL import/export.
-
-// We always want to import the symbols of the gflags library
-#ifndef GFLAGS_DLL_DECL
-# if 0 && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-
-// We always want to import variables declared in user code
-#ifndef GFLAGS_DLL_DECLARE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECLARE_FLAG
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Flag types
-#include <string>
-#if 1
-# include <stdint.h> // the normal place uint32_t is defined
-#elif 1
-# include <sys/types.h> // the normal place u_int32_t is defined
-#elif 0
-# include <inttypes.h> // a third place for uint32_t or u_int32_t
-#endif
-
-namespace GFLAGS_NAMESPACE {
-
-#if 0 // C99
-typedef int32_t int32;
-typedef uint32_t uint32;
-typedef int64_t int64;
-typedef uint64_t uint64;
-#elif 0 // BSD
-typedef int32_t int32;
-typedef u_int32_t uint32;
-typedef int64_t int64;
-typedef u_int64_t uint64;
-#elif 1 // Windows
-typedef __int32 int32;
-typedef unsigned __int32 uint32;
-typedef __int64 int64;
-typedef unsigned __int64 uint64;
-#else
-# error Do not know how to define a 32-bit integer quantity on your system
-#endif
-
-} // namespace GFLAGS_NAMESPACE
-
-
-namespace fLS {
-
-// The meaning of "string" might be different between now and when the
-// macros below get invoked (e.g., if someone is experimenting with
-// other string implementations that get defined after this file is
-// included). Save the current meaning now and use it in the macros.
-typedef std::string clstring;
-
-} // namespace fLS
-
-
-#define DECLARE_VARIABLE(type, shorttype, name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
- using fL##shorttype::FLAGS_##name
-
-#define DECLARE_bool(name) \
- DECLARE_VARIABLE(bool, B, name)
-
-#define DECLARE_int32(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
-
-#define DECLARE_int64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
-
-#define DECLARE_uint64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
-
-#define DECLARE_double(name) \
- DECLARE_VARIABLE(double, D, name)
-
-#define DECLARE_string(name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fLS { \
- using ::fLS::clstring; \
- extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
- } \
- using fLS::FLAGS_##name
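-
-// Example use (a sketch): a file that only reads a flag defined elsewhere
-// with DEFINE_bool(verbose, ...) would write:
-//    DECLARE_bool(verbose);
-//    ...
-//    if (FLAGS_verbose) printf("verbose mode\n");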
-
-
-#endif // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
deleted file mode 100644
index 0c17825d..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2014, Andreas Schuh
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// -----------------------------------------------------------------------------
-// Imports the gflags library symbols into an alternative/deprecated namespace.
-
-#ifndef GFLAGS_GFLAGS_H_
-# error The internal header gflags_gflags.h may only be included by gflags.h
-#endif
-
-#ifndef GFLAGS_NS_GFLAGS_H_
-#define GFLAGS_NS_GFLAGS_H_
-
-
-namespace gflags {
-
-
-using GFLAGS_NAMESPACE::int32;
-using GFLAGS_NAMESPACE::uint32;
-using GFLAGS_NAMESPACE::int64;
-using GFLAGS_NAMESPACE::uint64;
-
-using GFLAGS_NAMESPACE::RegisterFlagValidator;
-using GFLAGS_NAMESPACE::CommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetAllFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
-using GFLAGS_NAMESPACE::DescribeOneFlag;
-using GFLAGS_NAMESPACE::SetArgv;
-using GFLAGS_NAMESPACE::GetArgvs;
-using GFLAGS_NAMESPACE::GetArgv;
-using GFLAGS_NAMESPACE::GetArgv0;
-using GFLAGS_NAMESPACE::GetArgvSum;
-using GFLAGS_NAMESPACE::ProgramInvocationName;
-using GFLAGS_NAMESPACE::ProgramInvocationShortName;
-using GFLAGS_NAMESPACE::ProgramUsage;
-using GFLAGS_NAMESPACE::VersionString;
-using GFLAGS_NAMESPACE::GetCommandLineOption;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
-using GFLAGS_NAMESPACE::FlagSettingMode;
-using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
-using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
-using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
-using GFLAGS_NAMESPACE::SetCommandLineOption;
-using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
-using GFLAGS_NAMESPACE::FlagSaver;
-using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
-using GFLAGS_NAMESPACE::ReadFlagsFromString;
-using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
-using GFLAGS_NAMESPACE::ReadFromFlagsFile;
-using GFLAGS_NAMESPACE::BoolFromEnv;
-using GFLAGS_NAMESPACE::Int32FromEnv;
-using GFLAGS_NAMESPACE::Int64FromEnv;
-using GFLAGS_NAMESPACE::Uint64FromEnv;
-using GFLAGS_NAMESPACE::DoubleFromEnv;
-using GFLAGS_NAMESPACE::StringFromEnv;
-using GFLAGS_NAMESPACE::SetUsageMessage;
-using GFLAGS_NAMESPACE::SetVersionString;
-using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
-using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
-using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
-using GFLAGS_NAMESPACE::FlagRegisterer;
-
-#ifndef SWIG
-using GFLAGS_NAMESPACE::ParseCommandLineFlags;
-#endif
-
-
-} // namespace gflags
-
-
-#endif // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/private/config.h b/files/third_party/gflags/gen/win/include/private/config.h
deleted file mode 100644
index d541580e..00000000
--- a/files/third_party/gflags/gen/win/include/private/config.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Generated from config.h.in during build configuration using CMake. */
-
-// Note: This header file is only used internally. It is not part of the public interface!
-
-// ---------------------------------------------------------------------------
-// System checks
-
-// Define if you build this library for an MS Windows OS.
-#define OS_WINDOWS
-
-// Define if you have the <stdint.h> header file.
-#define HAVE_STDINT_H
-
-// Define if you have the <sys/types.h> header file.
-#define HAVE_SYS_TYPES_H
-
-// Define if you have the <inttypes.h> header file.
-/* #undef HAVE_INTTYPES_H */
-
-// Define if you have the <sys/stat.h> header file.
-#define HAVE_SYS_STAT_H
-
-// Define if you have the <unistd.h> header file.
-/* #undef HAVE_UNISTD_H */
-
-// Define if you have the <fnmatch.h> header file.
-/* #undef HAVE_FNMATCH_H */
-
-// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
-#define HAVE_SHLWAPI_H
-
-// Define if you have the strtoll function.
-/* #undef HAVE_STRTOLL */
-
-// Define if you have the strtoq function.
-/* #undef HAVE_STRTOQ */
-
-// Define if you have the <pthread.h> header file.
-/* #undef HAVE_PTHREAD */
-
-// Define if your pthread library defines the type pthread_rwlock_t
-/* #undef HAVE_RWLOCK */
-
-// gcc requires this to get PRId64, etc.
-#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
-# define __STDC_FORMAT_MACROS 1
-#endif
-
-// ---------------------------------------------------------------------------
-// Package information
-
-// Name of package.
-#define PACKAGE gflags
-
-// Define to the full name of this package.
-#define PACKAGE_NAME gflags
-
-// Define to the full name and version of this package.
-#define PACKAGE_STRING gflags 2.2.0
-
-// Define to the one symbol short name of this package.
-#define PACKAGE_TARNAME gflags-2.2.0
-
-// Define to the version of this package.
-#define PACKAGE_VERSION 2.2.0
-
-// Version number of package.
-#define VERSION PACKAGE_VERSION
-
-// Define to the address where bug reports for this package should be sent.
-#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
-
-// ---------------------------------------------------------------------------
-// Path separator
-#ifndef PATH_SEPARATOR
-# ifdef OS_WINDOWS
-# define PATH_SEPARATOR '\\'
-# else
-# define PATH_SEPARATOR '/'
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Windows
-
-// Whether gflags library is a DLL.
-#ifndef GFLAGS_IS_A_DLL
-# define GFLAGS_IS_A_DLL 0
-#endif
-
-// Always export symbols when compiling a shared library as this file is only
-// included by internal modules when building the gflags library itself.
-// The gflags_declare.h header file will set it to import these symbols otherwise.
-#ifndef GFLAGS_DLL_DECL
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-// Flags defined by the gflags library itself must be exported
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
-#endif
-
-#ifdef OS_WINDOWS
-// The unittests import the symbols of the shared gflags library
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
-# endif
-# include "windows_port.h"
-#endif
diff --git a/files/third_party/gflags/gflags.gyp b/files/third_party/gflags/gflags.gyp
deleted file mode 100644
index 37f2815a..00000000
--- a/files/third_party/gflags/gflags.gyp
+++ /dev/null
@@ -1,92 +0,0 @@
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a copy of WebRTC's gflags.gyp.
-
-{
- 'variables': {
- 'gflags_root': '<(DEPTH)/third_party/gflags',
- 'conditions': [
- ['OS=="win"', {
- 'gflags_gen_arch_root': '<(gflags_root)/gen/win',
- }, {
- 'gflags_gen_arch_root': '<(gflags_root)/gen/posix',
- }],
- ],
- },
- 'targets': [
- {
- 'target_name': 'gflags',
- 'type': 'static_library',
- 'include_dirs': [
- '<(gflags_gen_arch_root)/include/gflags', # For configured files.
- '<(gflags_gen_arch_root)/include/private', # For config.h
- '<(gflags_root)/src/src', # For everything else.
- ],
- 'defines': [
- # These macros exist so flags and symbols are properly
- # exported when building DLLs. Since we don't build DLLs, we
- # need to disable them.
- 'GFLAGS_DLL_DECL=',
- 'GFLAGS_DLL_DECLARE_FLAG=',
- 'GFLAGS_DLL_DEFINE_FLAG=',
- ],
- 'direct_dependent_settings': {
- 'include_dirs': [
- '<(gflags_gen_arch_root)/include', # For configured files.
- '<(gflags_root)/src/src', # For everything else.
- ],
- 'defines': [
- 'GFLAGS_DLL_DECL=',
- 'GFLAGS_DLL_DECLARE_FLAG=',
- 'GFLAGS_DLL_DEFINE_FLAG=',
- ],
- },
- 'sources': [
- 'src/src/gflags.cc',
- 'src/src/gflags_completions.cc',
- 'src/src/gflags_reporting.cc',
- ],
- 'conditions': [
- ['OS=="win"', {
- 'sources': [
- 'src/src/windows_port.cc',
- ],
- 'msvs_disabled_warnings': [
- 4005, # WIN32_LEAN_AND_MEAN redefinition.
- 4267, # Conversion from size_t to "type".
- ],
- 'configurations': {
- 'Common_Base': {
- 'msvs_configuration_attributes': {
- 'CharacterSet': '2', # Use Multi-byte Character Set.
- },
- },
- },
- }],
- # TODO(andrew): Look into fixing this warning upstream:
- # http://code.google.com/p/webrtc/issues/detail?id=760
- ['OS=="win" and clang==1', {
- 'msvs_settings': {
- 'VCCLCompilerTool': {
- 'AdditionalOptions': [
- '-Wno-microsoft-include',
- ],
- },
- },
- }],
- ['clang==1', {
- 'cflags': [
- '-Wno-microsoft-include',
- ],
- }],
- ],
- },
- ],
-}
diff --git a/files/tools/OWNERS b/files/tools/OWNERS
deleted file mode 100644
index f0963525..00000000
--- a/files/tools/OWNERS
+++ /dev/null
@@ -1,61 +0,0 @@
-# You can add new small tools to this directory at your desire, feel free
-# to owners-TBR new folders (assuming you have a regular review already,
-# of course). Include an OWNERS file with at least two people for your new
-# folder.
-# If you're changing existing tools, have your change reviewed by the
-# OWNERS of the existing tool.
-
-dpranke@chromium.org
-scottmg@chromium.org
-thakis@chromium.org
-
-# These aren't actually great contact points for this directory, but
-# changes in this directory are rare and most changes happen in better-owned
-# subdirectories.
-#
-# TEAM: infra-dev@chromium.org
-# COMPONENT: Build
-
-per-file bisect*.py=anantha@chromium.org
-per-file bisect*.py=prasadv@chromium.org
-per-file bisect*.py=robertocn@chromium.org
-per-file run-bisect*.py=prasadv@chromium.org
-per-file run-bisect*.py=robertocn@chromium.org
-per-file prepare-bisect*.py=prasadv@chromium.org
-per-file prepare-bisect*.py=robertocn@chromium.org
-
-per-file boilerplate.py=rsesek@chromium.org
-
-per-file check_git_config.py=iannucci@chromium.org
-per-file check_git_config.py=vadimsh@chromium.org
-
-per-file check_grd_for_unused_strings.py=estade@chromium.org
-
-per-file gyp-explain.py=thakis@chromium.org
-
-per-file gypv8shy.py=jochen@chromium.org
-
-per-file include_tracer.py=thakis@chromium.org
-
-per-file ipc_messages_log.py=yfriedman@chromium.org
-
-per-file licenses.py=file://tools/copyright_scanner/OWNERS
-
-per-file remove_stale_pyc_files.py=dtu@chromium.org
-
-per-file roll_angle.py=kbr@chromium.org
-per-file roll_angle.py=kjellander@chromium.org
-per-file roll_angle.py=geofflang@chromium.org
-per-file roll_webgl_conformance.py=bajones@chromium.org
-per-file roll_webgl_conformance.py=kbr@chromium.org
-per-file roll_webgl_conformance.py=kjellander@chromium.org
-per-file roll_webgl_conformance.py=geofflang@chromium.org
-per-file roll_webgl_conformance.py=zmo@chromium.org
-per-file roll_webrtc.py=kjellander@chromium.org
-
-per-file safely-roll-deps.py=borenet@chromium.org
-
-per-file sort-headers.py=satorux@chromium.org
-per-file sort-sources.py=satorux@chromium.org
-per-file yes_no.py=satorux@chromium.org
-
diff --git a/files/tools/msan/OWNERS b/files/tools/msan/OWNERS
deleted file mode 100644
index ab97cb0f..00000000
--- a/files/tools/msan/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-# pbos@chromium.org
-fbarchard@google.com
-kjellander@google.com
diff --git a/files/tools/msan/blacklist.txt b/files/tools/msan/blacklist.txt
deleted file mode 100644
index 40ea4b83..00000000
--- a/files/tools/msan/blacklist.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# The rules in this file are only applied at compile time. If you can modify the
-# source in question, consider function attributes to disable instrumentation.
-#
-# Please think twice before you add or remove these rules.
-
-# False positive in ffmpeg due to assembly code. http://crbug.com/344505
-fun:ff_get_cpu_flags_x86
-
-# Benign uninits in zlib.
-# http://crbug.com/116277
-fun:*MOZ_Z_deflate*
-# http://crbug.com/418383
-fun:longest_match
-
-# Uninit in zlib with SIMD intrinsic http://crbug.com/426868
-fun:crc_fold512_to_32
-
-# Uninit in OSMesa. http://crbug.com/347967
-fun:unpack_RGBA8888
-fun:unpack_RGB888
-
-# False positives due to use of linux_syscall_support. http://crbug.com/394028
-src:*/breakpad/src/*
-src:*/components/crash/content/app/breakpad_linux.cc
diff --git a/files/tools/ubsan/OWNERS b/files/tools/ubsan/OWNERS
deleted file mode 100644
index 32b7466f..00000000
--- a/files/tools/ubsan/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-# pbos@webrtc.org
-kjellander@google.com
-fbarchard@google.com
diff --git a/files/tools/ubsan/blacklist.txt b/files/tools/ubsan/blacklist.txt
deleted file mode 100644
index e1e3c08a..00000000
--- a/files/tools/ubsan/blacklist.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-#############################################################################
-# UBSan blacklist.
-
-#############################################################################
-# YASM does some funny things that UBsan doesn't like.
-# https://crbug.com/489901
-src:*/third_party/yasm/*
-
-#############################################################################
-# V8 gives too many false positives. Ignore them for now.
-src:*/v8/*
-
-#############################################################################
-# Ignore system libraries.
-src:*/usr/*
-
-#############################################################################
-# V8 UBSan suppressions, commented out for now since we are ignoring v8
-# completely.
-# fun:*v8*internal*FastD2I*
-# fun:*v8*internal*ComputeIntegerHash*
-# fun:*v8*internal*ComputeLongHash*
-# fun:*v8*internal*ComputePointerHash*
-# src:*/v8/src/base/bits.cc
-# src:*/v8/src/base/functional.cc
-# Undefined behaviour (integer overflow) is expected but ignored in this
-# function.
-# fun:*JsonParser*ParseJsonNumber*
-
-# Runtime numeric functions.
-# src:*/v8/src/runtime/runtime-numbers.cc
-
-# Shifts of negative numbers
-# fun:*v8*internal*HPositionInfo*TagPosition*
-# fun:*v8*internal*Range*Shl*
-# fun:*v8*internal*RelocInfoWriter*WriteTaggedData*
-
-#############################################################################
-# Undefined arithmetic that can be safely ignored.
-src:*/base/numerics/saturated_arithmetic.h
-src:*/ppapi/shared_impl/id_assignment.h
-
-#############################################################################
-# ICU suppressions. Mostly hash functions where integer overflow is OK.
-fun:*hashEntry*
-fun:*LocaleCacheKey*hashCode*
-fun:*google*protobuf*hash*
-fun:*(hash|Hash)*
-
-#############################################################################
-# Bounds blacklist.
-# Array at the end of struct pattern:
-# Maybe UBSan itself can be improved here?
-# e.g.
-# struct blah {
-# int a;
-# char foo[2]; // not actually 2
-# }
-src:*/net/disk_cache/blockfile/backend_impl.cc
-src:*/net/disk_cache/blockfile/entry_impl.cc
-src:*/third_party/icu/source/common/rbbi.cpp
-src:*/third_party/icu/source/common/rbbitblb.cpp
-src:*/third_party/icu/source/common/ucmndata.c
-
-#############################################################################
-# Delete in destructor on a this where this == nullptr
-fun:*re2*RegexpD*
-
-#############################################################################
-# Harmless float division by zero.
-fun:*RendererFrameManager*CullUnlockedFrames*
-
-#############################################################################
-# libc++ __tree and map are not UBSAN clean
-# https://llvm.org/bugs/show_bug.cgi?id=19302
-src:*/third_party/libc\+\+/trunk/include/__tree
-src:*/third_party/libc\+\+/trunk/include/map
diff --git a/files/tools/ubsan/vptr_blacklist.txt b/files/tools/ubsan/vptr_blacklist.txt
deleted file mode 100644
index e8382039..00000000
--- a/files/tools/ubsan/vptr_blacklist.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-#############################################################################
-# UBSan vptr blacklist.
-# Function- and type-based blacklisting use mangled names, and it is especially
-# tricky to represent C++ types. For now, any possible variations due to name
-# mangling are simply represented as regexp wildcard expressions, so this list
-# may over-blacklist.
-
-#############################################################################
-# Identical layouts.
-# If base and derived classes have identical memory layouts (i.e., the same
-# object size) and both have no virtual functions, we blacklist them as there
-# are not many security implications.
-
-fun:*LifecycleNotifier*addObserver*
-fun:*LifecycleNotifier*removeObserver*
-fun:*toWebInputElement*
-type:*base*MessageLoopForIO*
-type:*BlockRefType*
-type:*SkAutoTUnref*
-type:*WDResult*
-type:*ExecutionContext*
-type:*WebInputElement*
-type:*WebFormControlElement*
-
-# Avoid identical layout cases for 86 different classes in InspectorTypeBuilder,
-# all of which are guarded using COMPILER_ASSERT on the object size. Two more
-# types are also blacklisted due to the template class (JSONArray <-> Array<T>).
-
-src:*InspectorTypeBuilder.h*
-type:*TypeBuilder*
-type:*JSONArray*
-
-#############################################################################
-# Base class's constructor accesses a derived class's member.
-
-fun:*DoublyLinkedListNode*
-type:*content*WebUIExtensionData*
-
-# RenderFrameObserverTracker<T>::RenderFrameObserverTracker()
-fun:*content*RenderFrameObserverTracker*RenderFrame*
-
-# RenderViewObserverTracker<T>::RenderViewObserverTracker()
-fun:*content*RenderViewObserverTracker*RenderView*
-
-#############################################################################
-# Base class's destructor accesses a derived class.
-
-fun:*DatabaseContext*contextDestroyed*
-
-# FIXME: Cannot handle template function LifecycleObserver<>::setContext,
-# so exclude source file for now.
-src:*LifecycleObserver.h*
-
-#############################################################################
-# static_cast into itself in the constructor.
-
-fun:*RefCountedGarbageCollected*makeKeepAlive*
-fun:*ThreadSafeRefCountedGarbageCollected*makeKeepAlive*
-
-#############################################################################
-# Accessing data in destructors where the class has virtual inheritances.
-
-type:*content*RenderWidgetHost*
-
-# Match mangled name for X::~X().
-fun:*content*RenderThreadImplD*
-fun:*content*RenderViewHostImplD*
-fun:*content*UtilityThreadImplD*
-
-#############################################################################
-# Using raw pointer values.
-#
-# A raw pointer value (16) is used to infer the field offset by
-# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
-
-src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
-src:*/third_party/protobuf/src/google/protobuf/compiler/cpp/cpp_message.cc
-src:*/third_party/protobuf/src/google/protobuf/descriptor.pb.cc
-
-#############################################################################
-# Avoid link errors.
-# Ubsan vptr needs typeinfo on the target class, but it looks like typeinfo is
-# not available if the class is not exported. For now, simply blacklist to
-# avoid link errors; e.g., undefined reference to 'typeinfo for [CLASS_NAME]'.
-
-# obj/ppapi/libppapi_proxy.a(obj/ppapi/proxy/ppapi_proxy.proxy_channel.o):../../ppapi/proxy/proxy_channel.cc:__unnamed_53: error: undefined reference to 'typeinfo for IPC::TestSink'
-src:*/ppapi/proxy/proxy_channel.cc
-
-# obj/chrome/libbrowser.a(obj/chrome/browser/net/browser.predictor.o):../../chrome/browser/net/predictor.cc:__unnamed_577: error: undefined reference to 'typeinfo for ProxyAdvisor'
-src:*/chrome/browser/net/predictor.cc
-
-# obj/third_party/pdfium/libfpdfapi.a(obj/third_party/pdfium/core/src/fpdfapi/fpdf_render/fpdfapi.fpdf_render_text.o):../../third_party/pdfium/core/src/fpdfapi/fpdf_render/:__unnamed_360: error: undefined reference to 'typeinfo for CPDF_InlineImages'
-src:*/third_party/pdfium/core/src/fpdfapi/fpdf_render/fpdf_render_text.cpp
-
-# obj/third_party/libwebm/libwebm.a(obj/third_party/libwebm/source/libwebm.mkvmuxer.o)(.data.rel..L__unnamed_2+0x18): error: undefined reference to 'typeinfo for mkvparser::IMkvReader'
-src:*/third_party/libwebm/source/mkvmuxer.cpp
-
-#############################################################################
-# LLVM is not UBSan vptr clean.
-src:*third_party/swiftshader/third_party/LLVM*
-
-#############################################################################
-# UBSan seems to emit false positives when virtual base classes are
-# involved; see e.g. crbug.com/448102.
-
-type:*v8*internal*OFStream*
-
-#############################################################################
-# UBsan is unable to handle static_cast<A*>(nullptr) and crashes with SIGSEGV.
-#
-
-# static_cast<StartPageService*> in StartPageServiceFactory::GetForProfile.
-type:*StartPageService*
-
-# Remove once function attribute level blacklisting is implemented.
-# See crbug.com/476063.
-fun:*forbidGCDuringConstruction*
-
-#############################################################################
-# UBsan goes into an infinite recursion when __dynamic_cast is instrumented
-# with "vptr". See crbug.com/609786.
-
-src:*/third_party/libc\+\+abi/trunk/src/private_typeinfo.cpp
-
-#############################################################################
-# invalid downcasts for IPC messages
-# https://crbug.com/520760
-src:*nacl_message_scanner.cc
diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py
deleted file mode 100755
index 8359d309..00000000
--- a/files/tools_libyuv/autoroller/roll_deps.py
+++ /dev/null
@@ -1,507 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2017 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a modified copy of the script in
-# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
-# customized for libyuv.
-
-
-"""Script to automatically roll dependencies in the libyuv DEPS file."""
-
-import argparse
-import base64
-import collections
-import logging
-import os
-import re
-import subprocess
-import sys
-import urllib2
-
-
-# Skip these dependencies (list without solution name prefix).
-DONT_AUTOROLL_THESE = [
- 'src/third_party/gflags/src',
-]
-
-LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
-CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
-CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
-CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
-CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
-
-COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
-CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
-ROLL_BRANCH_NAME = 'roll_chromium_revision'
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir,
- os.pardir))
-CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
-
-sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
-import find_depot_tools
-find_depot_tools.add_depot_tools_to_path()
-
-CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
-CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
- 'clang', 'scripts', 'update.py')
-
-DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
-ChangedDep = collections.namedtuple('ChangedDep',
- 'path url current_rev new_rev')
-
-class RollError(Exception):
- pass
-
-
-def VarLookup(local_scope):
- return lambda var_name: local_scope['vars'][var_name]
-
-
-def ParseDepsDict(deps_content):
- local_scope = {}
- global_scope = {
- 'Var': VarLookup(local_scope),
- 'deps_os': {},
- }
- exec(deps_content, global_scope, local_scope)
- return local_scope
-
-
-def ParseLocalDepsFile(filename):
- with open(filename, 'rb') as f:
- deps_content = f.read()
- return ParseDepsDict(deps_content)
-
-
-def ParseRemoteCrDepsFile(revision):
- deps_content = ReadRemoteCrFile('DEPS', revision)
- return ParseDepsDict(deps_content)
-
-
-def ParseCommitPosition(commit_message):
- for line in reversed(commit_message.splitlines()):
- m = COMMIT_POSITION_RE.match(line.strip())
- if m:
- return int(m.group(1))
- logging.error('Failed to parse commit position id from:\n%s\n',
- commit_message)
- sys.exit(-1)
-
-
-def _RunCommand(command, working_dir=None, ignore_exit_code=False,
- extra_env=None):
- """Runs a command and returns the output from that command.
-
- If the command fails (exit code != 0), the function will exit the process.
-
- Returns:
- A tuple containing the stdout and stderr outputs as strings.
- """
- working_dir = working_dir or CHECKOUT_SRC_DIR
- logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
- env = os.environ.copy()
- if extra_env:
- assert all(isinstance(value, str) for value in extra_env.values())
- logging.debug('extra env: %s', extra_env)
- env.update(extra_env)
- p = subprocess.Popen(command, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE, env=env,
- cwd=working_dir, universal_newlines=True)
- std_output = p.stdout.read()
- err_output = p.stderr.read()
- p.wait()
- p.stdout.close()
- p.stderr.close()
- if not ignore_exit_code and p.returncode != 0:
- logging.error('Command failed: %s\n'
- 'stdout:\n%s\n'
- 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
- sys.exit(p.returncode)
- return std_output, err_output
-
-
-def _GetBranches():
- """Returns a tuple of active,branches.
-
- The 'active' is the name of the currently active branch and 'branches' is a
- list of all branches.
- """
- lines = _RunCommand(['git', 'branch'])[0].split('\n')
- branches = []
- active = ''
- for line in lines:
- if '*' in line:
- # The assumption is that the first char will always be the '*'.
- active = line[1:].strip()
- branches.append(active)
- else:
- branch = line.strip()
- if branch:
- branches.append(branch)
- return active, branches
-
-
-def _ReadGitilesContent(url):
- # Download and decode BASE64 content until
- # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
- base64_content = ReadUrlContent(url + '?format=TEXT')
- return base64.b64decode(base64_content[0])
-
-
-def ReadRemoteCrFile(path_below_src, revision):
- """Reads a remote Chromium file of a specific revision. Returns a string."""
- return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision,
- path_below_src))
-
-
-def ReadRemoteCrCommit(revision):
- """Reads a remote Chromium commit message. Returns a string."""
- return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
-
-
-def ReadUrlContent(url):
- """Connect to a remote host and read the contents. Returns a list of lines."""
- conn = urllib2.urlopen(url)
- try:
- return conn.readlines()
- except IOError as e:
- logging.exception('Error connecting to %s. Error: %s', url, e)
- raise
- finally:
- conn.close()
-
-
-def GetMatchingDepsEntries(depsentry_dict, dir_path):
- """Gets all deps entries matching the provided path.
-
- This list may contain more than one DepsEntry object.
- Example: dir_path='src/testing' would give results containing both
- 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS.
- Example 2: dir_path='src/build' should return 'src/build' but not
- 'src/buildtools'.
-
- Returns:
- A list of DepsEntry objects.
- """
- result = []
- for path, depsentry in depsentry_dict.iteritems():
- if path == dir_path:
- result.append(depsentry)
- else:
- parts = path.split('/')
- if all(part == parts[i]
- for i, part in enumerate(dir_path.split('/'))):
- result.append(depsentry)
- return result
-
-
-def BuildDepsentryDict(deps_dict):
- """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict."""
- result = {}
- def AddDepsEntries(deps_subdict):
- for path, deps_url_spec in deps_subdict.iteritems():
-      # The deps url spec is either a dict with a URL and a condition, or just
- if isinstance(deps_url_spec, dict):
- if deps_url_spec.get('dep_type') == 'cipd':
- continue
- deps_url = deps_url_spec['url']
- else:
- deps_url = deps_url_spec
-
- if not result.has_key(path):
- url, revision = deps_url.split('@') if deps_url else (None, None)
- result[path] = DepsEntry(path, url, revision)
-
- AddDepsEntries(deps_dict['deps'])
-  for deps_os in ['win', 'mac', 'unix', 'android', 'ios']:
- AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
- return result
-
-
-def CalculateChangedDeps(libyuv_deps, new_cr_deps):
- """
- Calculate changed deps entries based on entries defined in the libyuv DEPS
- file:
-  - If it's a dependency shared with the Chromium DEPS file: roll it to the
-    same revision as Chromium (i.e. the entry in the new_cr_deps dict).
- - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
- this means it may be ahead of the chromium_revision, but generally these
- should be close).
- - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
- unless it's configured to be skipped.
-
- Returns:
- A list of ChangedDep objects representing the changed deps.
- """
- result = []
- libyuv_entries = BuildDepsentryDict(libyuv_deps)
- new_cr_entries = BuildDepsentryDict(new_cr_deps)
- for path, libyuv_deps_entry in libyuv_entries.iteritems():
- if path in DONT_AUTOROLL_THESE:
- continue
- cr_deps_entry = new_cr_entries.get(path)
- if cr_deps_entry:
- # Use the revision from Chromium's DEPS file.
- new_rev = cr_deps_entry.revision
- assert libyuv_deps_entry.url == cr_deps_entry.url, (
- 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' %
- (path, libyuv_deps_entry.url, cr_deps_entry.url))
- else:
- # Use the HEAD of the deps repo.
- stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url,
- 'HEAD'])
- new_rev = stdout.strip().split('\t')[0]
-
- # Check if an update is necessary.
- if libyuv_deps_entry.revision != new_rev:
- logging.debug('Roll dependency %s to %s', path, new_rev)
- result.append(ChangedDep(path, libyuv_deps_entry.url,
- libyuv_deps_entry.revision, new_rev))
- return sorted(result)
-
-
-def CalculateChangedClang(new_cr_rev):
- def GetClangRev(lines):
- for line in lines:
- match = CLANG_REVISION_RE.match(line)
- if match:
- return match.group(1)
- raise RollError('Could not parse Clang revision!')
-
- with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
- current_lines = f.readlines()
- current_rev = GetClangRev(current_lines)
-
- new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
- new_cr_rev).splitlines()
- new_rev = GetClangRev(new_clang_update_py)
- return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
-
-
-def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
- new_commit_pos, changed_deps_list, clang_change):
- current_cr_rev = current_cr_rev[0:10]
- new_cr_rev = new_cr_rev[0:10]
- rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
- git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
-
- commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval,
- git_number_interval)]
- commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
- commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
- rev_interval))
- if changed_deps_list:
- commit_msg.append('Changed dependencies:')
-
- for c in changed_deps_list:
- commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url,
- c.current_rev[0:10],
- c.new_rev[0:10]))
- change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
- commit_msg.append('DEPS diff: %s\n' % change_url)
- else:
- commit_msg.append('No dependencies changed.')
-
- if clang_change.current_rev != clang_change.new_rev:
- commit_msg.append('Clang version changed %s:%s' %
- (clang_change.current_rev, clang_change.new_rev))
- change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
- CLANG_UPDATE_SCRIPT_URL_PATH)
- commit_msg.append('Details: %s\n' % change_url)
- else:
- commit_msg.append('No update to Clang.\n')
-
- # TBR needs to be non-empty for Gerrit to process it.
- git_author = _RunCommand(['git', 'config', 'user.email'],
- working_dir=CHECKOUT_SRC_DIR)[0].strip()
- commit_msg.append('TBR=%s' % git_author)
-
- commit_msg.append('BUG=None')
- return '\n'.join(commit_msg)
-
-
-def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision,
- changed_deps):
- """Update the DEPS file with the new revision."""
-
- # Update the chromium_revision variable.
- with open(deps_filename, 'rb') as deps_file:
- deps_content = deps_file.read()
- deps_content = deps_content.replace(old_cr_revision, new_cr_revision)
- with open(deps_filename, 'wb') as deps_file:
- deps_file.write(deps_content)
-
- # Update each individual DEPS entry.
- for dep in changed_deps:
- local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
- if not os.path.isdir(local_dep_dir):
- raise RollError(
- 'Cannot find local directory %s. Make sure the .gclient file\n'
- 'contains all platforms in the target_os list, i.e.\n'
- 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
- 'Then run "gclient sync" again.' % local_dep_dir)
- _RunCommand(
- ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)],
- working_dir=CHECKOUT_SRC_DIR)
-
-
-def _IsTreeClean():
- stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
- if len(stdout) == 0:
- return True
-
- logging.error('Dirty/unversioned files:\n%s', stdout)
- return False
-
-
-def _EnsureUpdatedMasterBranch(dry_run):
- current_branch = _RunCommand(
- ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0]
- if current_branch != 'master':
- logging.error('Please checkout the master branch and re-run this script.')
- if not dry_run:
- sys.exit(-1)
-
- logging.info('Updating master branch...')
- _RunCommand(['git', 'pull'])
-
-
-def _CreateRollBranch(dry_run):
- logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
- if not dry_run:
- _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
-
-
-def _RemovePreviousRollBranch(dry_run):
- active_branch, branches = _GetBranches()
- if active_branch == ROLL_BRANCH_NAME:
- active_branch = 'master'
- if ROLL_BRANCH_NAME in branches:
- logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
- if not dry_run:
- _RunCommand(['git', 'checkout', active_branch])
- _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
-
-
-def _LocalCommit(commit_msg, dry_run):
- logging.info('Committing changes locally.')
- if not dry_run:
- _RunCommand(['git', 'add', '--update', '.'])
- _RunCommand(['git', 'commit', '-m', commit_msg])
-
-
-def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
- if skip_cq:
- return 0
- if (new_commit_pos - current_commit_pos) < cq_over:
- return 1
- return 2
-
-
-def _UploadCL(commit_queue_mode):
- """Upload the committed changes as a changelist to Gerrit.
-
- commit_queue_mode:
- - 2: Submit to commit queue.
- - 1: Run trybots but do not submit to CQ.
- - 0: Skip CQ, upload only.
- """
- cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail']
- if commit_queue_mode >= 2:
- logging.info('Sending the CL to the CQ...')
- cmd.extend(['--use-commit-queue'])
- elif commit_queue_mode >= 1:
- logging.info('Starting CQ dry run...')
- cmd.extend(['--cq-dry-run'])
- extra_env = {
- 'EDITOR': 'true',
- 'SKIP_GCE_AUTH_FOR_GIT': '1',
- }
- stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
- logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
- stdout, stderr)
-
-
-def main():
- p = argparse.ArgumentParser()
- p.add_argument('--clean', action='store_true', default=False,
- help='Removes any previous local roll branch.')
- p.add_argument('-r', '--revision',
- help=('Chromium Git revision to roll to. Defaults to the '
- 'Chromium HEAD revision if omitted.'))
- p.add_argument('--dry-run', action='store_true', default=False,
- help=('Calculate changes and modify DEPS, but don\'t create '
- 'any local branch, commit, upload CL or send any '
- 'tryjobs.'))
- p.add_argument('-i', '--ignore-unclean-workdir', action='store_true',
- default=False,
- help=('Ignore if the current branch is not master or if there '
- 'are uncommitted changes (default: %(default)s).'))
- grp = p.add_mutually_exclusive_group()
- grp.add_argument('--skip-cq', action='store_true', default=False,
- help='Skip sending the CL to the CQ (default: %(default)s)')
- grp.add_argument('--cq-over', type=int, default=1,
- help=('Commit queue dry run if the revision difference '
- 'is below this number (default: %(default)s)'))
- p.add_argument('-v', '--verbose', action='store_true', default=False,
- help='Be extra verbose in printing of log messages.')
- opts = p.parse_args()
-
- if opts.verbose:
- logging.basicConfig(level=logging.DEBUG)
- else:
- logging.basicConfig(level=logging.INFO)
-
- if not opts.ignore_unclean_workdir and not _IsTreeClean():
- logging.error('Please clean your local checkout first.')
- return 1
-
- if opts.clean:
- _RemovePreviousRollBranch(opts.dry_run)
-
- if not opts.ignore_unclean_workdir:
- _EnsureUpdatedMasterBranch(opts.dry_run)
-
- new_cr_rev = opts.revision
- if not new_cr_rev:
- stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
- head_rev = stdout.strip().split('\t')[0]
- logging.info('No revision specified. Using HEAD: %s', head_rev)
- new_cr_rev = head_rev
-
- deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
- libyuv_deps = ParseLocalDepsFile(deps_filename)
- current_cr_rev = libyuv_deps['vars']['chromium_revision']
-
- current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev))
- new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev))
-
- new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev)
- changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
- clang_change = CalculateChangedClang(new_cr_rev)
- commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev,
- current_commit_pos, new_commit_pos,
- changed_deps, clang_change)
- logging.debug('Commit message:\n%s', commit_msg)
-
- _CreateRollBranch(opts.dry_run)
- UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
- _LocalCommit(commit_msg, opts.dry_run)
- commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
- current_commit_pos, new_commit_pos)
- logging.info('Uploading CL...')
- if not opts.dry_run:
- _UploadCL(commit_queue_mode)
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/tools_libyuv/autoroller/unittests/.DS_Store b/files/tools_libyuv/autoroller/unittests/.DS_Store
deleted file mode 100644
index 70369d69..00000000
--- a/files/tools_libyuv/autoroller/unittests/.DS_Store
+++ /dev/null
Binary files differ
diff --git a/files/tools_libyuv/valgrind/chrome_tests.bat b/files/tools_libyuv/valgrind/chrome_tests.bat
deleted file mode 100755
index 9d4c8ca8..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.bat
+++ /dev/null
@@ -1,53 +0,0 @@
-@echo off
-:: Copyright (c) 2011 The Chromium Authors. All rights reserved.
-:: Use of this source code is governed by a BSD-style license that can be
-:: found in the LICENSE file.
-
-setlocal
-
-set THISDIR=%~dp0
-set TOOL_NAME="unknown"
-
-:: Get the tool name and put it into TOOL_NAME {{{1
-:: NB: SHIFT command doesn't modify %*
-:PARSE_ARGS_LOOP
- if %1 == () GOTO:TOOLNAME_NOT_FOUND
- if %1 == --tool GOTO:TOOLNAME_FOUND
- SHIFT
- goto :PARSE_ARGS_LOOP
-
-:TOOLNAME_NOT_FOUND
-echo "Please specify a tool (e.g. drmemory) by using --tool flag"
-exit /B 1
-
-:TOOLNAME_FOUND
-SHIFT
-set TOOL_NAME=%1
-:: }}}
-if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
-echo "Unknown tool: `%TOOL_NAME%`! Only drmemory is supported right now"
-exit /B 1
-
-:SETUP_DRMEMORY
-:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
-set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
-set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
-if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
-echo "Can't find Dr. Memory executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:DRMEMORY_BINARY_OK
-%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
-set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
-:: }}}
-goto :RUN_TESTS
-
-:RUN_TESTS
-set PYTHONPATH=%THISDIR%../python/google
-set RUNNING_ON_VALGRIND=yes
-python %THISDIR%/chrome_tests.py %*
diff --git a/files/tools_libyuv/valgrind/chrome_tests.py b/files/tools_libyuv/valgrind/chrome_tests.py
deleted file mode 100755
index fe899bce..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.py
+++ /dev/null
@@ -1,869 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-''' Runs various chrome tests through valgrind_test.py.'''
-
-import glob
-import logging
-import multiprocessing
-import optparse
-import os
-import stat
-import subprocess
-import sys
-
-import logging_utils
-import path_utils
-
-import common
-import valgrind_test
-
-class TestNotFound(Exception): pass
-
-class MultipleGTestFiltersSpecified(Exception): pass
-
-class BuildDirNotFound(Exception): pass
-
-class BuildDirAmbiguous(Exception): pass
-
-class ExecutableNotFound(Exception): pass
-
-class BadBinary(Exception): pass
-
-class ChromeTests:
- SLOW_TOOLS = ["memcheck", "drmemory"]
- LAYOUT_TESTS_DEFAULT_CHUNK_SIZE = 300
-
- def __init__(self, options, args, test):
- if ':' in test:
- (self._test, self._gtest_filter) = test.split(':', 1)
- else:
- self._test = test
- self._gtest_filter = options.gtest_filter
-
- if self._test not in self._test_list:
- raise TestNotFound("Unknown test: %s" % test)
-
- if options.gtest_filter and options.gtest_filter != self._gtest_filter:
-      raise MultipleGTestFiltersSpecified("Cannot specify both --gtest_filter "
- "and --test %s" % test)
-
- self._options = options
- self._args = args
-
- script_dir = path_utils.ScriptDir()
- # Compute the top of the tree (the "source dir") from the script dir (where
- # this script lives). We assume that the script dir is in tools/valgrind/
- # relative to the top of the tree.
- self._source_dir = os.path.dirname(os.path.dirname(script_dir))
- # since this path is used for string matching, make sure it's always
- # an absolute Unix-style path
- self._source_dir = os.path.abspath(self._source_dir).replace('\\', '/')
- valgrind_test_script = os.path.join(script_dir, "valgrind_test.py")
- self._command_preamble = ["--source-dir=%s" % (self._source_dir)]
-
- if not self._options.build_dir:
- dirs = [
- os.path.join(self._source_dir, "xcodebuild", "Debug"),
- os.path.join(self._source_dir, "out", "Debug"),
- os.path.join(self._source_dir, "build", "Debug"),
- ]
- build_dir = [d for d in dirs if os.path.isdir(d)]
- if len(build_dir) > 1:
- raise BuildDirAmbiguous("Found more than one suitable build dir:\n"
- "%s\nPlease specify just one "
- "using --build-dir" % ", ".join(build_dir))
- elif build_dir:
- self._options.build_dir = build_dir[0]
- else:
- self._options.build_dir = None
-
- if self._options.build_dir:
- build_dir = os.path.abspath(self._options.build_dir)
- self._command_preamble += ["--build-dir=%s" % (self._options.build_dir)]
-
- def _EnsureBuildDirFound(self):
- if not self._options.build_dir:
- raise BuildDirNotFound("Oops, couldn't find a build dir, please "
- "specify it manually using --build-dir")
-
- def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
- '''Generates the default command array that most tests will use.'''
- if exe and common.IsWindows():
- exe += '.exe'
-
- cmd = list(self._command_preamble)
-
- # Find all suppressions matching the following pattern:
- # tools/valgrind/TOOL/suppressions[_PLATFORM].txt
- # and list them with --suppressions= prefix.
- script_dir = path_utils.ScriptDir()
-    tool_name = tool.ToolName()
- suppression_file = os.path.join(script_dir, tool_name, "suppressions.txt")
- if os.path.exists(suppression_file):
- cmd.append("--suppressions=%s" % suppression_file)
-    # Platform-specific suppressions
- for platform in common.PlatformNames():
- platform_suppression_file = \
- os.path.join(script_dir, tool_name, 'suppressions_%s.txt' % platform)
- if os.path.exists(platform_suppression_file):
- cmd.append("--suppressions=%s" % platform_suppression_file)
-
- if tool_name == "drmemory":
- if self._options.drmemory_ops:
-        # Prepend " " so that Dr. Memory's options don't confuse optparse.
- cmd += ["--drmemory_ops", " " + self._options.drmemory_ops]
-
- if self._options.valgrind_tool_flags:
- cmd += self._options.valgrind_tool_flags.split(" ")
- if self._options.keep_logs:
- cmd += ["--keep_logs"]
- if valgrind_test_args != None:
- for arg in valgrind_test_args:
- cmd.append(arg)
- if exe:
- self._EnsureBuildDirFound()
- exe_path = os.path.join(self._options.build_dir, exe)
- if not os.path.exists(exe_path):
- raise ExecutableNotFound("Couldn't find '%s'" % exe_path)
-
- # Make sure we don't try to test ASan-built binaries
- # with other dynamic instrumentation-based tools.
- # TODO(timurrrr): also check TSan and MSan?
- # `nm` might not be available, so use try-except.
- try:
- # Do not perform this check on OS X, as 'nm' on 10.6 can't handle
- # binaries built with Clang 3.5+.
- if not common.IsMac():
- nm_output = subprocess.check_output(["nm", exe_path])
- if nm_output.find("__asan_init") != -1:
- raise BadBinary("You're trying to run an executable instrumented "
- "with AddressSanitizer under %s. Please provide "
- "an uninstrumented executable." % tool_name)
- except OSError:
- pass
-
- cmd.append(exe_path)
-      # Valgrind runs tests slowly, so slow tests hurt more; show elapsed time
-      # so we can find the slowpokes.
- cmd.append("--gtest_print_time")
-      # The built-in test launcher for gtest-based executables runs tests in
-      # multiple processes by default. Force single-process mode back on.
- cmd.append("--single-process-tests")
- if self._options.gtest_repeat:
- cmd.append("--gtest_repeat=%s" % self._options.gtest_repeat)
- if self._options.gtest_shuffle:
- cmd.append("--gtest_shuffle")
- if self._options.gtest_break_on_failure:
- cmd.append("--gtest_break_on_failure")
- if self._options.test_launcher_bot_mode:
- cmd.append("--test-launcher-bot-mode")
- if self._options.test_launcher_total_shards is not None:
- cmd.append("--test-launcher-total-shards=%d"
- % self._options.test_launcher_total_shards)
- if self._options.test_launcher_shard_index is not None:
- cmd.append("--test-launcher-shard-index=%d"
- % self._options.test_launcher_shard_index)
- return cmd
-
- def Run(self):
- ''' Runs the test specified by command-line argument --test '''
- logging.info("running test %s" % (self._test))
- return self._test_list[self._test](self)
-
- def _AppendGtestFilter(self, tool, name, cmd):
- '''Append an appropriate --gtest_filter flag to the googletest binary
- invocation.
- If the user passed their own filter mentioning only one test, just use
- it. Otherwise, filter out tests listed in the appropriate gtest_exclude
- files.
- '''
- if (self._gtest_filter and
- ":" not in self._gtest_filter and
- "?" not in self._gtest_filter and
- "*" not in self._gtest_filter):
- cmd.append("--gtest_filter=%s" % self._gtest_filter)
- return
-
- filters = []
- gtest_files_dir = os.path.join(path_utils.ScriptDir(), "gtest_exclude")
-
- gtest_filter_files = [
- os.path.join(gtest_files_dir, name + ".gtest-%s.txt" % tool.ToolName())]
- # Use ".gtest.txt" files only for slow tools, as they now contain
- # Valgrind- and Dr.Memory-specific filters.
- # TODO(glider): rename the files to ".gtest_slow.txt"
- if tool.ToolName() in ChromeTests.SLOW_TOOLS:
- gtest_filter_files += [os.path.join(gtest_files_dir, name + ".gtest.txt")]
- for platform_suffix in common.PlatformNames():
- gtest_filter_files += [
- os.path.join(gtest_files_dir, name + ".gtest_%s.txt" % platform_suffix),
- os.path.join(gtest_files_dir, name + ".gtest-%s_%s.txt" % \
- (tool.ToolName(), platform_suffix))]
- logging.info("Reading gtest exclude filter files:")
- for filename in gtest_filter_files:
- # strip the leading absolute path (may be very long on the bot)
- # and the following / or \.
- readable_filename = filename.replace("\\", "/") # '\' on Windows
- readable_filename = readable_filename.replace(self._source_dir, "")[1:]
- if not os.path.exists(filename):
- logging.info(" \"%s\" - not found" % readable_filename)
- continue
- logging.info(" \"%s\" - OK" % readable_filename)
- f = open(filename, 'r')
- for line in f.readlines():
- if line.startswith("#") or line.startswith("//") or line.isspace():
- continue
- line = line.rstrip()
- test_prefixes = ["FLAKY", "FAILS"]
- for p in test_prefixes:
- # Strip prefixes from the test names.
- line = line.replace(".%s_" % p, ".")
- # Exclude the original test name.
- filters.append(line)
- if line[-2:] != ".*":
- # List all possible prefixes if line doesn't end with ".*".
- for p in test_prefixes:
- filters.append(line.replace(".", ".%s_" % p))
- # Get rid of duplicates.
- filters = set(filters)
- gtest_filter = self._gtest_filter
- if len(filters):
- if gtest_filter:
- gtest_filter += ":"
- if gtest_filter.find("-") < 0:
- gtest_filter += "-"
- else:
- gtest_filter = "-"
- gtest_filter += ":".join(filters)
- if gtest_filter:
- cmd.append("--gtest_filter=%s" % gtest_filter)
-
- @staticmethod
- def ShowTests():
- test_to_names = {}
- for name, test_function in ChromeTests._test_list.iteritems():
- test_to_names.setdefault(test_function, []).append(name)
-
- name_to_aliases = {}
- for names in test_to_names.itervalues():
- names.sort(key=lambda name: len(name))
- name_to_aliases[names[0]] = names[1:]
-
- print
- print "Available tests:"
- print "----------------"
- for name, aliases in sorted(name_to_aliases.iteritems()):
- if aliases:
- print " {} (aka {})".format(name, ', '.join(aliases))
- else:
- print " {}".format(name)
-
- def SetupLdPath(self, requires_build_dir):
- if requires_build_dir:
- self._EnsureBuildDirFound()
- elif not self._options.build_dir:
- return
-
- # Append build_dir to LD_LIBRARY_PATH so external libraries can be loaded.
- if (os.getenv("LD_LIBRARY_PATH")):
- os.putenv("LD_LIBRARY_PATH", "%s:%s" % (os.getenv("LD_LIBRARY_PATH"),
- self._options.build_dir))
- else:
- os.putenv("LD_LIBRARY_PATH", self._options.build_dir)
-
- def SimpleTest(self, module, name, valgrind_test_args=None, cmd_args=None):
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool, name, valgrind_test_args)
- self._AppendGtestFilter(tool, name, cmd)
- cmd.extend(['--test-tiny-timeout=1000'])
- if cmd_args:
- cmd.extend(cmd_args)
-
- self.SetupLdPath(True)
- return tool.Run(cmd, module)
-
- def RunCmdLine(self):
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool, None, self._args)
- self.SetupLdPath(False)
- return tool.Run(cmd, None)
-
- def TestAccessibility(self):
- return self.SimpleTest("accessibility", "accessibility_unittests")
-
- def TestAddressInput(self):
- return self.SimpleTest("addressinput", "libaddressinput_unittests")
-
- def TestAngle(self):
- return self.SimpleTest("angle", "angle_unittests")
-
- def TestAppList(self):
- return self.SimpleTest("app_list", "app_list_unittests")
-
- def TestAsh(self):
- return self.SimpleTest("ash", "ash_unittests")
-
- def TestAura(self):
- return self.SimpleTest("aura", "aura_unittests")
-
- def TestBase(self):
- return self.SimpleTest("base", "base_unittests")
-
- def TestBlinkHeap(self):
- return self.SimpleTest("blink_heap", "blink_heap_unittests")
-
- def TestBlinkPlatform(self):
- return self.SimpleTest("blink_platform", "blink_platform_unittests")
-
- def TestCacheInvalidation(self):
- return self.SimpleTest("cacheinvalidation", "cacheinvalidation_unittests")
-
- def TestCast(self):
- return self.SimpleTest("chrome", "cast_unittests")
-
- def TestCC(self):
- return self.SimpleTest("cc", "cc_unittests",
- cmd_args=[
- "--cc-layer-tree-test-long-timeout"])
-
- def TestChromeApp(self):
- return self.SimpleTest("chrome_app", "chrome_app_unittests")
-
- def TestChromeElf(self):
- return self.SimpleTest("chrome_elf", "chrome_elf_unittests")
-
- def TestChromeDriver(self):
- return self.SimpleTest("chromedriver", "chromedriver_unittests")
-
- def TestChromeOS(self):
- return self.SimpleTest("chromeos", "chromeos_unittests")
-
- def TestComponents(self):
- return self.SimpleTest("components", "components_unittests")
-
- def TestCompositor(self):
- return self.SimpleTest("compositor", "compositor_unittests")
-
- def TestContent(self):
- return self.SimpleTest("content", "content_unittests")
-
- def TestCourgette(self):
- return self.SimpleTest("courgette", "courgette_unittests")
-
- def TestCrypto(self):
- return self.SimpleTest("crypto", "crypto_unittests")
-
- def TestDevice(self):
- return self.SimpleTest("device", "device_unittests")
-
- def TestDisplay(self):
- return self.SimpleTest("display", "display_unittests")
-
- def TestEvents(self):
- return self.SimpleTest("events", "events_unittests")
-
- def TestExtensions(self):
- return self.SimpleTest("extensions", "extensions_unittests")
-
- def TestFFmpegRegressions(self):
- return self.SimpleTest("chrome", "ffmpeg_regression_tests")
-
- def TestGCM(self):
- return self.SimpleTest("gcm", "gcm_unit_tests")
-
- def TestGfx(self):
- return self.SimpleTest("gfx", "gfx_unittests")
-
- def TestGin(self):
- return self.SimpleTest("gin", "gin_unittests")
-
- def TestGoogleApis(self):
- return self.SimpleTest("google_apis", "google_apis_unittests")
-
- def TestGPU(self):
- return self.SimpleTest("gpu", "gpu_unittests")
-
- def TestIpc(self):
- return self.SimpleTest("ipc", "ipc_tests",
- valgrind_test_args=["--trace_children"])
-
- def TestInstallerUtil(self):
- return self.SimpleTest("installer_util", "installer_util_unittests")
-
- def TestInstallStatic(self):
- return self.SimpleTest("install_static", "install_static_unittests")
-
- def TestJingle(self):
- return self.SimpleTest("chrome", "jingle_unittests")
-
- def TestKeyboard(self):
- return self.SimpleTest("keyboard", "keyboard_unittests")
-
- def TestLatency(self):
- return self.SimpleTest("latency", "latency_unittests")
-
- def TestMedia(self):
- return self.SimpleTest("chrome", "media_unittests")
-
- def TestMessageCenter(self):
- return self.SimpleTest("message_center", "message_center_unittests")
-
- def TestMidi(self):
- return self.SimpleTest("chrome", "midi_unittests")
-
- def TestMojoCommon(self):
- return self.SimpleTest("mojo_common", "mojo_common_unittests")
-
- def TestMojoPublicBindings(self):
- return self.SimpleTest("mojo_public_bindings",
- "mojo_public_bindings_unittests")
-
- def TestMojoPublicSystem(self):
- return self.SimpleTest("mojo_public_system",
- "mojo_public_system_unittests")
-
- def TestMojoPublicSysPerf(self):
- return self.SimpleTest("mojo_public_sysperf",
- "mojo_public_system_perftests")
-
- def TestMojoSystem(self):
- return self.SimpleTest("mojo_system", "mojo_system_unittests")
-
- def TestNet(self):
- return self.SimpleTest("net", "net_unittests")
-
- def TestNetPerf(self):
- return self.SimpleTest("net", "net_perftests")
-
- def TestPhoneNumber(self):
- return self.SimpleTest("phonenumber", "libphonenumber_unittests")
-
- def TestPPAPI(self):
- return self.SimpleTest("chrome", "ppapi_unittests")
-
- def TestPrinting(self):
- return self.SimpleTest("chrome", "printing_unittests")
-
- def TestRemoting(self):
- return self.SimpleTest("chrome", "remoting_unittests",
- cmd_args=[
- "--ui-test-action-timeout=60000",
- "--ui-test-action-max-timeout=150000"])
-
- def TestSkia(self):
- return self.SimpleTest("skia", "skia_unittests")
-
- def TestSql(self):
- return self.SimpleTest("chrome", "sql_unittests")
-
- def TestStorage(self):
- return self.SimpleTest("storage", "storage_unittests")
-
- def TestLinuxSandbox(self):
- return self.SimpleTest("sandbox", "sandbox_linux_unittests")
-
- def TestUnit(self):
- # http://crbug.com/51716
- # Disabling all unit tests
- # Problems reappeared after r119922
- if common.IsMac() and (self._options.valgrind_tool == "memcheck"):
- logging.warning("unit_tests are disabled for memcheck on MacOS.")
-      return 0
- return self.SimpleTest("chrome", "unit_tests")
-
- def TestUIBaseUnit(self):
- return self.SimpleTest("chrome", "ui_base_unittests")
-
- def TestUIChromeOS(self):
- return self.SimpleTest("chrome", "ui_chromeos_unittests")
-
- def TestURL(self):
- return self.SimpleTest("chrome", "url_unittests")
-
- def TestViews(self):
- return self.SimpleTest("views", "views_unittests")
-
-
- # Valgrind timeouts are in seconds.
- UI_VALGRIND_ARGS = ["--timeout=14400", "--trace_children", "--indirect"]
- # UI test timeouts are in milliseconds.
- UI_TEST_ARGS = ["--ui-test-action-timeout=60000",
- "--ui-test-action-max-timeout=150000",
- "--no-sandbox"]
-
- # TODO(thestig) fine-tune these values.
- # Valgrind timeouts are in seconds.
- BROWSER_VALGRIND_ARGS = ["--timeout=50000", "--trace_children", "--indirect"]
- # Browser test timeouts are in milliseconds.
- BROWSER_TEST_ARGS = ["--ui-test-action-timeout=400000",
- "--ui-test-action-max-timeout=800000",
- "--no-sandbox"]
-
- def TestBrowser(self):
- return self.SimpleTest("chrome", "browser_tests",
- valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
- cmd_args=self.BROWSER_TEST_ARGS)
-
- def TestContentBrowser(self):
- return self.SimpleTest("content", "content_browsertests",
- valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
- cmd_args=self.BROWSER_TEST_ARGS)
-
- def TestInteractiveUI(self):
- return self.SimpleTest("chrome", "interactive_ui_tests",
- valgrind_test_args=self.UI_VALGRIND_ARGS,
- cmd_args=self.UI_TEST_ARGS)
-
- def TestSyncIntegration(self):
- return self.SimpleTest("chrome", "sync_integration_tests",
- valgrind_test_args=self.UI_VALGRIND_ARGS,
- cmd_args=(["--ui-test-action-max-timeout=450000"]))
-
- def TestLayoutChunk(self, chunk_num, chunk_size):
- # Run tests [chunk_num*chunk_size .. (chunk_num+1)*chunk_size) from the
- # list of tests. Wrap around to beginning of list at end.
- # If chunk_size is zero, run all tests in the list once.
- # If a text file is given as argument, it is used as the list of tests.
- assert((chunk_size == 0) != (len(self._args) == 0))
- # Build the ginormous commandline in 'cmd'.
- # It's going to be roughly
- # python valgrind_test.py ...
- # but we'll use the --indirect flag to valgrind_test.py
- # to avoid valgrinding python.
- # Start by building the valgrind_test.py commandline.
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool)
- cmd.append("--trace_children")
- cmd.append("--indirect_webkit_layout")
- cmd.append("--ignore_exit_code")
- # Now build script_cmd, the run-webkits-tests commandline.
- # Store each chunk in its own directory so that we can find the data later
- chunk_dir = os.path.join("layout", "chunk_%05d" % chunk_num)
- out_dir = os.path.join(path_utils.ScriptDir(), "latest")
- out_dir = os.path.join(out_dir, chunk_dir)
- if os.path.exists(out_dir):
- old_files = glob.glob(os.path.join(out_dir, "*.txt"))
- for f in old_files:
- os.remove(f)
- else:
- os.makedirs(out_dir)
- script = os.path.join(self._source_dir, "third_party", "WebKit", "Tools",
- "Scripts", "run-webkit-tests")
- # http://crbug.com/260627: After the switch to content_shell from DRT, each
- # test now brings up 3 processes. Under Valgrind, they become memory bound
- # and can eventually OOM if we don't reduce the total count.
- # It'd be nice if content_shell automatically throttled the startup of new
- # tests if we're low on memory.
- jobs = max(1, int(multiprocessing.cpu_count() * 0.3))
- script_cmd = ["python", script, "-v",
- # run a separate DumpRenderTree for each test
- "--batch-size=1",
- "--fully-parallel",
- "--child-processes=%d" % jobs,
- "--time-out-ms=800000",
- "--no-retry-failures", # retrying takes too much time
- # http://crbug.com/176908: Don't launch a browser when done.
- "--no-show-results",
- "--nocheck-sys-deps",
- "--additional-driver-flag=--no-sandbox"]
- # Pass build mode to run-webkit-tests. We aren't passed it directly,
- # so parse it out of build_dir. run-webkit-tests can only handle
- # the two values "Release" and "Debug".
- # TODO(Hercules): unify how all our scripts pass around build mode
- # (--mode / --target / --build-dir / --debug)
- if self._options.build_dir:
- build_root, mode = os.path.split(self._options.build_dir)
- script_cmd.extend(["--build-directory", build_root, "--target", mode])
- if (chunk_size > 0):
- script_cmd.append("--run-chunk=%d:%d" % (chunk_num, chunk_size))
- if len(self._args):
- # if the arg is a txt file, then treat it as a list of tests
- if os.path.isfile(self._args[0]) and self._args[0][-4:] == ".txt":
- script_cmd.append("--test-list=%s" % self._args[0])
- else:
- script_cmd.extend(self._args)
- self._AppendGtestFilter(tool, "layout", script_cmd)
- # Now run script_cmd with the wrapper in cmd
- cmd.extend(["--"])
- cmd.extend(script_cmd)
-
-    # Layout tests often fail quickly, but the buildbot remains green.
- # Detect this situation when running with the default chunk size.
- if chunk_size == self.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE:
- min_runtime_in_seconds=120
- else:
- min_runtime_in_seconds=0
- ret = tool.Run(cmd, "layout", min_runtime_in_seconds=min_runtime_in_seconds)
- return ret
-
-
- def TestLayout(self):
- # A "chunk file" is maintained in the local directory so that each test
- # runs a slice of the layout tests of size chunk_size that increments with
- # each run. Since tests can be added and removed from the layout tests at
- # any time, this is not going to give exact coverage, but it will allow us
- # to continuously run small slices of the layout tests under valgrind rather
- # than having to run all of them in one shot.
- chunk_size = self._options.num_tests
- if chunk_size == 0 or len(self._args):
- return self.TestLayoutChunk(0, 0)
- chunk_num = 0
- chunk_file = os.path.join("valgrind_layout_chunk.txt")
- logging.info("Reading state from " + chunk_file)
- try:
- f = open(chunk_file)
- if f:
- chunk_str = f.read()
- if len(chunk_str):
- chunk_num = int(chunk_str)
-          # This should be enough so that we have a couple of complete runs
-          # of test data stored in the archive (although note that when we
-          # loop, we are almost guaranteed not to be at the end of the test list).
- if chunk_num > 10000:
- chunk_num = 0
- f.close()
- except IOError, (errno, strerror):
- logging.error("error reading from file %s (%d, %s)" % (chunk_file,
- errno, strerror))
- # Save the new chunk size before running the tests. Otherwise if a
- # particular chunk hangs the bot, the chunk number will never get
- # incremented and the bot will be wedged.
- logging.info("Saving state to " + chunk_file)
- try:
- f = open(chunk_file, "w")
- chunk_num += 1
- f.write("%d" % chunk_num)
- f.close()
- except IOError, (errno, strerror):
- logging.error("error writing to file %s (%d, %s)" % (chunk_file, errno,
- strerror))
- # Since we're running small chunks of the layout tests, it's important to
- # mark the ones that have errors in them. These won't be visible in the
- # summary list for long, but will be useful for someone reviewing this bot.
- return self.TestLayoutChunk(chunk_num, chunk_size)
-
- # The known list of tests.
- # Recognise the original abbreviations as well as full executable names.
- _test_list = {
- "cmdline" : RunCmdLine,
- "addressinput": TestAddressInput,
- "libaddressinput_unittests": TestAddressInput,
- "accessibility": TestAccessibility,
- "angle": TestAngle, "angle_unittests": TestAngle,
- "app_list": TestAppList, "app_list_unittests": TestAppList,
- "ash": TestAsh, "ash_unittests": TestAsh,
- "aura": TestAura, "aura_unittests": TestAura,
- "base": TestBase, "base_unittests": TestBase,
- "blink_heap": TestBlinkHeap,
- "blink_platform": TestBlinkPlatform,
- "browser": TestBrowser, "browser_tests": TestBrowser,
- "cacheinvalidation": TestCacheInvalidation,
- "cacheinvalidation_unittests": TestCacheInvalidation,
- "cast": TestCast, "cast_unittests": TestCast,
- "cc": TestCC, "cc_unittests": TestCC,
- "chrome_app": TestChromeApp,
- "chrome_elf": TestChromeElf,
- "chromedriver": TestChromeDriver,
- "chromeos": TestChromeOS, "chromeos_unittests": TestChromeOS,
- "components": TestComponents,"components_unittests": TestComponents,
- "compositor": TestCompositor,"compositor_unittests": TestCompositor,
- "content": TestContent, "content_unittests": TestContent,
- "content_browsertests": TestContentBrowser,
- "courgette": TestCourgette, "courgette_unittests": TestCourgette,
- "crypto": TestCrypto, "crypto_unittests": TestCrypto,
- "device": TestDevice, "device_unittests": TestDevice,
- "display": TestDisplay, "display_unittests": TestDisplay,
- "events": TestEvents, "events_unittests": TestEvents,
- "extensions": TestExtensions, "extensions_unittests": TestExtensions,
- "ffmpeg_regression_tests": TestFFmpegRegressions,
- "gcm": TestGCM, "gcm_unit_tests": TestGCM,
- "gin": TestGin, "gin_unittests": TestGin,
- "gfx": TestGfx, "gfx_unittests": TestGfx,
- "google_apis": TestGoogleApis,
- "gpu": TestGPU, "gpu_unittests": TestGPU,
- "ipc": TestIpc, "ipc_tests": TestIpc,
- "installer_util": TestInstallerUtil,
- "installer_util_unittests": TestInstallerUtil,
- "install_static_unittests": TestInstallStatic,
- "interactive_ui": TestInteractiveUI,
- "jingle": TestJingle, "jingle_unittests": TestJingle,
- "keyboard": TestKeyboard, "keyboard_unittests": TestKeyboard,
- "latency": TestLatency, "latency_unittests": TestLatency,
- "layout": TestLayout, "layout_tests": TestLayout,
- "media": TestMedia, "media_unittests": TestMedia,
- "message_center": TestMessageCenter,
- "message_center_unittests" : TestMessageCenter,
- "midi": TestMidi, "midi_unittests": TestMidi,
- "mojo_common": TestMojoCommon,
- "mojo_common_unittests": TestMojoCommon,
- "mojo_system": TestMojoSystem,
- "mojo_system_unittests": TestMojoSystem,
- "mojo_public_system": TestMojoPublicSystem,
- "mojo_public_system_unittests": TestMojoPublicSystem,
- "mojo_public_bindings": TestMojoPublicBindings,
- "mojo_public_bindings_unittests": TestMojoPublicBindings,
- "mojo_public_sysperf": TestMojoPublicSysPerf,
- "net": TestNet, "net_unittests": TestNet,
- "net_perf": TestNetPerf, "net_perftests": TestNetPerf,
- "phonenumber": TestPhoneNumber,
- "libphonenumber_unittests": TestPhoneNumber,
- "ppapi": TestPPAPI, "ppapi_unittests": TestPPAPI,
- "printing": TestPrinting, "printing_unittests": TestPrinting,
- "remoting": TestRemoting, "remoting_unittests": TestRemoting,
- "sandbox": TestLinuxSandbox, "sandbox_linux_unittests": TestLinuxSandbox,
- "skia": TestSkia, "skia_unittests": TestSkia,
- "sql": TestSql, "sql_unittests": TestSql,
- "storage": TestStorage, "storage_unittests": TestStorage,
- "sync_integration_tests": TestSyncIntegration,
- "sync_integration": TestSyncIntegration,
- "ui_base_unit": TestUIBaseUnit, "ui_base_unittests": TestUIBaseUnit,
- "ui_chromeos": TestUIChromeOS, "ui_chromeos_unittests": TestUIChromeOS,
- "unit": TestUnit, "unit_tests": TestUnit,
- "url": TestURL, "url_unittests": TestURL,
- "views": TestViews, "views_unittests": TestViews,
- "webkit": TestLayout,
- }
-
-
-def _main():
- parser = optparse.OptionParser("usage: %prog -b <dir> -t <test> "
- "[-t <test> ...]")
-
- parser.add_option("--help-tests", dest="help_tests", action="store_true",
- default=False, help="List all available tests")
- parser.add_option("-b", "--build-dir",
- help="the location of the compiler output")
- parser.add_option("--target", help="Debug or Release")
- parser.add_option("-t", "--test", action="append", default=[],
- help="which test to run, supports test:gtest_filter format "
- "as well.")
- parser.add_option("--baseline", action="store_true", default=False,
- help="generate baseline data instead of validating")
- parser.add_option("-f", "--force", action="store_true", default=False,
- help="run a broken test anyway")
- parser.add_option("--gtest_filter",
- help="additional arguments to --gtest_filter")
- parser.add_option("--gtest_repeat", help="argument for --gtest_repeat")
- parser.add_option("--gtest_shuffle", action="store_true", default=False,
- help="Randomize tests' orders on every iteration.")
- parser.add_option("--gtest_break_on_failure", action="store_true",
- default=False,
- help="Drop in to debugger on assertion failure. Also "
- "useful for forcing tests to exit with a stack dump "
- "on the first assertion failure when running with "
- "--gtest_repeat=-1")
- parser.add_option("-v", "--verbose", action="store_true", default=False,
- help="verbose output - enable debug log messages")
- parser.add_option("--tool", dest="valgrind_tool", default="memcheck",
- help="specify a valgrind tool to run the tests under")
- parser.add_option("--tool_flags", dest="valgrind_tool_flags", default="",
- help="specify custom flags for the selected valgrind tool")
- parser.add_option("--keep_logs", action="store_true", default=False,
- help="store memory tool logs in the <tool>.logs directory "
- "instead of /tmp.\nThis can be useful for tool "
- "developers/maintainers.\nPlease note that the <tool>"
- ".logs directory will be clobbered on tool startup.")
- parser.add_option("-n", "--num_tests", type="int",
- default=ChromeTests.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE,
- help="for layout tests: # of subtests per run. 0 for all.")
- parser.add_option("--test-launcher-bot-mode", action="store_true",
- help="run the tests with --test-launcher-bot-mode")
- parser.add_option("--test-launcher-total-shards", type=int,
- help="run the tests with --test-launcher-total-shards")
- parser.add_option("--test-launcher-shard-index", type=int,
- help="run the tests with --test-launcher-shard-index")
- parser.add_option("--drmemory_ops",
- help="extra options passed to Dr. Memory")
-
- options, args = parser.parse_args()
-
- # Bake target into build_dir.
- if options.target and options.build_dir:
- assert (options.target !=
- os.path.basename(os.path.dirname(options.build_dir)))
- options.build_dir = os.path.join(os.path.abspath(options.build_dir),
- options.target)
-
- if options.verbose:
- logging_utils.config_root(logging.DEBUG)
- else:
- logging_utils.config_root()
-
- if options.help_tests:
- ChromeTests.ShowTests()
- return 0
-
- if not options.test:
- parser.error("--test not specified")
-
- if len(options.test) != 1 and options.gtest_filter:
- parser.error("--gtest_filter and multiple tests don't make sense together")
-
- BROKEN_TESTS = {
- 'drmemory_light': [
- 'addressinput',
- 'aura',
- 'base_unittests',
- 'cc',
- 'components', # x64 only?
- 'content',
- 'gfx',
- 'mojo_public_bindings',
- ],
- 'drmemory_full': [
- 'addressinput',
- 'aura',
- 'base_unittests',
- 'blink_heap',
- 'blink_platform',
- 'browser_tests',
- 'cast',
- 'cc',
- 'chromedriver',
- 'compositor',
- 'content',
- 'content_browsertests',
- 'device',
- 'events',
- 'extensions',
- 'gfx',
- 'google_apis',
- 'gpu',
- 'ipc_tests',
- 'jingle',
- 'keyboard',
- 'media',
- 'midi',
- 'mojo_common',
- 'mojo_public_bindings',
- 'mojo_public_sysperf',
- 'mojo_public_system',
- 'mojo_system',
- 'net',
- 'remoting',
- 'unit',
- 'url',
- ],
- }
-
- for t in options.test:
-    if t in BROKEN_TESTS.get(options.valgrind_tool, []) and not options.force:
- logging.info("Skipping broken %s test %s -- see crbug.com/633693" %
- (options.valgrind_tool, t))
- return 0
-
- tests = ChromeTests(options, args, t)
- ret = tests.Run()
- if ret: return ret
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/chrome_tests.sh b/files/tools_libyuv/valgrind/chrome_tests.sh
deleted file mode 100755
index dc17684f..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Set up some paths and re-direct the arguments to chrome_tests.py
-
-export THISDIR=`dirname $0`
-ARGV_COPY="$@"
-
-# We need to set CHROME_VALGRIND iff using Memcheck:
-# tools/valgrind/chrome_tests.sh --tool memcheck
-# or
-# tools/valgrind/chrome_tests.sh --tool=memcheck
-tool="memcheck" # Default to memcheck.
-while (( "$#" ))
-do
- if [[ "$1" == "--tool" ]]
- then
- tool="$2"
- shift
- elif [[ "$1" =~ --tool=(.*) ]]
- then
- tool="${BASH_REMATCH[1]}"
- fi
- shift
-done
-
-NEEDS_VALGRIND=0
-NEEDS_DRMEMORY=0
-
-case "$tool" in
- "memcheck")
- NEEDS_VALGRIND=1
- ;;
- "drmemory" | "drmemory_light" | "drmemory_full" | "drmemory_pattern")
- NEEDS_DRMEMORY=1
- ;;
-esac
-
-if [ "$NEEDS_VALGRIND" == "1" ]
-then
- export CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
- if [ "$CHROME_VALGRIND" = "" ]
- then
- # locate_valgrind.sh failed
- exit 1
- fi
- echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
- PATH="${CHROME_VALGRIND}/bin:$PATH"
- # We need to set these variables to override default lib paths hard-coded into
- # Valgrind binary.
- export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
- export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
- # Clean up some /tmp directories that might be stale due to interrupted
- # chrome_tests.py execution.
- # FYI:
- # -mtime +1 <- only print files modified more than 24h ago,
- # -print0/-0 are needed to handle possible newlines in the filenames.
- echo "Cleanup /tmp from Valgrind stuff"
- find /tmp -maxdepth 1 \(\
- -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
- \) -mtime +1 -print0 | xargs -0 rm -rf
-fi
-
-if [ "$NEEDS_DRMEMORY" == "1" ]
-then
- if [ -z "$DRMEMORY_COMMAND" ]
- then
- DRMEMORY_PATH="$THISDIR/../../third_party/drmemory"
- DRMEMORY_SFX="$DRMEMORY_PATH/drmemory-windows-sfx.exe"
- if [ ! -f "$DRMEMORY_SFX" ]
- then
- echo "Can't find Dr. Memory executables."
- echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
- echo "for the instructions on how to get them."
- exit 1
- fi
-
- chmod +x "$DRMEMORY_SFX" # Cygwin won't run it without +x.
- "$DRMEMORY_SFX" -o"$DRMEMORY_PATH/unpacked" -y
- export DRMEMORY_COMMAND="$DRMEMORY_PATH/unpacked/bin/drmemory.exe"
- fi
-fi
-
-PYTHONPATH=$THISDIR/../python/google python \
- "$THISDIR/chrome_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/common.py b/files/tools_libyuv/valgrind/common.py
deleted file mode 100644
index e9ee51e4..00000000
--- a/files/tools_libyuv/valgrind/common.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-import logging
-import platform
-import os
-import signal
-import subprocess
-import sys
-import time
-
-
-class NotImplementedError(Exception):
- pass
-
-
-class TimeoutError(Exception):
- pass
-
-
-def RunSubprocessInBackground(proc):
- """Runs a subprocess in the background. Returns a handle to the process."""
- logging.info("running %s in the background" % " ".join(proc))
- return subprocess.Popen(proc)
-
-
-def RunSubprocess(proc, timeout=0):
- """ Runs a subprocess, until it finishes or |timeout| is exceeded and the
- process is killed with taskkill. A |timeout| <= 0 means no timeout.
-
- Args:
- proc: list of process components (exe + args)
- timeout: how long to wait before killing, <= 0 means wait forever
- """
-
- logging.info("running %s, timeout %d sec" % (" ".join(proc), timeout))
- sys.stdout.flush()
- sys.stderr.flush()
-
- # Manually read and print out stdout and stderr.
- # By default, the subprocess is supposed to inherit these from its parent,
- # however when run under buildbot, it seems unable to read data from a
- # grandchild process, so we have to read the child and print the data as if
- # it came from us for buildbot to read it. We're not sure why this is
- # necessary.
- # TODO(erikkay): should we buffer stderr and stdout separately?
- p = subprocess.Popen(proc, universal_newlines=True,
- bufsize=0, # unbuffered
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-
- logging.info("started subprocess")
-
- did_timeout = False
- if timeout > 0:
- wait_until = time.time() + timeout
- while p.poll() is None and not did_timeout:
- # Have to use readline rather than readlines() or "for line in p.stdout:",
- # otherwise we get buffered even with bufsize=0.
- line = p.stdout.readline()
- while line and not did_timeout:
- sys.stdout.write(line)
- sys.stdout.flush()
- line = p.stdout.readline()
- if timeout > 0:
- did_timeout = time.time() > wait_until
-
- if did_timeout:
- logging.info("process timed out")
- else:
- logging.info("process ended, did not time out")
-
- if did_timeout:
- if IsWindows():
- subprocess.call(["taskkill", "/T", "/F", "/PID", str(p.pid)])
- else:
- # Does this kill all children, too?
- os.kill(p.pid, signal.SIGINT)
- logging.error("KILLED %d" % p.pid)
- # Give the process a chance to actually die before continuing
- # so that cleanup can happen safely.
- time.sleep(1.0)
- logging.error("TIMEOUT waiting for %s" % proc[0])
- raise TimeoutError(proc[0])
- else:
- for line in p.stdout:
- sys.stdout.write(line)
- if not IsMac(): # stdout flush fails on Mac
- logging.info("flushing stdout")
- sys.stdout.flush()
-
- logging.info("collecting result code")
- result = p.poll()
- if result:
- logging.error("%s exited with non-zero result code %d" % (proc[0], result))
- return result
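The readline pumping and deadline handling in RunSubprocess can be condensed to the following self-contained sketch (Python 3 here; p.kill() stands in for the original's platform-specific taskkill/SIGINT logic):

    import subprocess
    import sys
    import time

    def run_with_deadline(cmd, timeout=0):
        # Stream child output line by line, enforcing an optional deadline.
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, universal_newlines=True)
        deadline = time.time() + timeout if timeout > 0 else None
        while p.poll() is None:
            line = p.stdout.readline()  # one line at a time, as above
            if line:
                sys.stdout.write(line)
                sys.stdout.flush()
            if deadline is not None and time.time() > deadline:
                p.kill()  # the original uses taskkill/SIGINT instead
                raise RuntimeError("TIMEOUT waiting for %s" % cmd[0])
        sys.stdout.write(p.stdout.read())  # drain whatever is left
        return p.returncode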
-
-
-def IsLinux():
- return sys.platform.startswith('linux')
-
-
-def IsMac():
- return sys.platform.startswith('darwin')
-
-
-def IsWindows():
- return sys.platform == 'cygwin' or sys.platform.startswith('win')
-
-
-def WindowsVersionName():
- """Returns the name of the Windows version if it is known, or None.
-
- Possible return values are: xp, vista, 7, 8, or None
- """
- if sys.platform == 'cygwin':
- # Windows version number is hiding in system name. Looks like:
- # CYGWIN_NT-6.1-WOW64
- try:
- version_str = platform.uname()[0].split('-')[1]
- except:
- return None
- elif sys.platform.startswith('win'):
- # Normal Windows version string. Mine: 6.1.7601
- version_str = platform.version()
- else:
- return None
-
- parts = version_str.split('.')
- try:
- major = int(parts[0])
- minor = int(parts[1])
- except:
- return None # Can't parse, unknown version.
-
- if major == 5:
- return 'xp'
- elif major == 6 and minor == 0:
- return 'vista'
- elif major == 6 and minor == 1:
- return '7'
- elif major == 6 and minor == 2:
- return '8' # Future proof. ;)
- return None
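For example, the "6.1.7601" version string quoted in the comment parses as follows (sample input assumed):

    version_str = "6.1.7601"  # sample value, as in the comment above
    major, minor = [int(p) for p in version_str.split(".")[:2]]
    print(major, minor)  # 6 1 -> WindowsVersionName() returns '7'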
-
-
-def PlatformNames():
- """Return an array of string to be used in paths for the platform
- (e.g. suppressions, gtest filters, ignore files etc.)
- The first element of the array describes the 'main' platform
- """
- if IsLinux():
- return ['linux']
- if IsMac():
- return ['mac']
- if IsWindows():
- names = ['win32']
- version_name = WindowsVersionName()
- if version_name is not None:
- names.append('win-%s' % version_name)
- return names
- raise NotImplementedError('Unknown platform "%s".' % sys.platform)
-
-
-def PutEnvAndLog(env_name, env_value):
- os.putenv(env_name, env_value)
- logging.info('export %s=%s', env_name, env_value)
-
-def BoringCallers(mangled, use_re_wildcards):
- """Return a list of 'boring' function names (optinally mangled)
- with */? wildcards (optionally .*/.).
- Boring = we drop off the bottom of stack traces below such functions.
- """
-
- need_mangling = [
- # Don't show our testing framework:
- ("testing::Test::Run", "_ZN7testing4Test3RunEv"),
- ("testing::TestInfo::Run", "_ZN7testing8TestInfo3RunEv"),
- ("testing::internal::Handle*ExceptionsInMethodIfSupported*",
- "_ZN7testing8internal3?Handle*ExceptionsInMethodIfSupported*"),
-
- # Depend on scheduling:
- ("MessageLoop::Run", "_ZN11MessageLoop3RunEv"),
- ("MessageLoop::RunTask", "_ZN11MessageLoop7RunTask*"),
- ("RunnableMethod*", "_ZN14RunnableMethod*"),
- ("DispatchToMethod*", "_Z*16DispatchToMethod*"),
- ("base::internal::Invoker*::DoInvoke*",
- "_ZN4base8internal8Invoker*DoInvoke*"), # Invoker{1,2,3}
- ("base::internal::RunnableAdapter*::Run*",
- "_ZN4base8internal15RunnableAdapter*Run*"),
- ]
-
- ret = []
- for pair in need_mangling:
- ret.append(pair[1 if mangled else 0])
-
- ret += [
- # Also don't show the internals of libc/pthread.
- "start_thread",
- "main",
- "BaseThreadInitThunk",
- ]
-
- if use_re_wildcards:
- for i in range(0, len(ret)):
- ret[i] = ret[i].replace('*', '.*').replace('?', '.')
-
- return ret
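A quick sketch of the use_re_wildcards rewrite in action (the mangled frame below is invented for illustration):

    import re

    pattern = "_ZN7testing8internal3?Handle*ExceptionsInMethodIfSupported*"
    regex = pattern.replace('*', '.*').replace('?', '.')
    frame = "_ZN7testing8internal35HandleSehExceptionsInMethodIfSupportedEv"
    print(bool(re.match("^%s$" % regex, frame)))  # True -> frame is 'boring'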
-
-def NormalizeWindowsPath(path):
- """If we're using Cygwin Python, turn the path into a Windows path.
-
-  Forward slashes are deliberately not turned into backslashes, to keep
-  copy-pasting and escaping easy.
-
- TODO(rnk): If we ever want to cut out the subprocess invocation, we can use
- _winreg to get the root Cygwin directory from the registry key:
- HKEY_LOCAL_MACHINE\SOFTWARE\Cygwin\setup\rootdir.
- """
- if sys.platform.startswith("cygwin"):
- p = subprocess.Popen(["cygpath", "-m", path],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- (out, err) = p.communicate()
- if err:
- logging.warning("WARNING: cygpath error: %s", err)
- return out.strip()
- else:
- return path
-
-############################
-# Common output format code
-
-def PrintUsedSuppressionsList(suppcounts):
- """ Prints out the list of used suppressions in a format common to all the
- memory tools. If the list is empty, prints nothing and returns False,
- otherwise True.
-
- suppcounts: a dictionary of used suppression counts,
- Key -> name, Value -> count.
- """
- if not suppcounts:
- return False
-
- print "-----------------------------------------------------"
- print "Suppressions used:"
- print " count name"
- for (name, count) in sorted(suppcounts.items(), key=lambda (k,v): (v,k)):
- print "%7d %s" % (count, name)
- print "-----------------------------------------------------"
- sys.stdout.flush()
- return True
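Note the tuple-unpacking lambda above is Python 2 only syntax; a Python 3 compatible equivalent of the same ordering, with sample data assumed, would be:

    suppcounts = {"bug_729": 3, "pango_font_leak_todo": 28}  # sample data
    # Sort by count first, then by name, as in PrintUsedSuppressionsList().
    for name, count in sorted(suppcounts.items(), key=lambda kv: (kv[1], kv[0])):
        print("%7d %s" % (count, name))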
diff --git a/files/tools_libyuv/valgrind/gdb_helper.py b/files/tools_libyuv/valgrind/gdb_helper.py
deleted file mode 100644
index d127f760..00000000
--- a/files/tools_libyuv/valgrind/gdb_helper.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-''' A bunch of helper functions for querying gdb.'''
-
-import logging
-import os
-import re
-import tempfile
-
-GDB_LINE_RE = re.compile(r'Line ([0-9]*) of "([^"]*)".*')
-
-def _GdbOutputToFileLine(output_line):
- ''' Parse the gdb output line, return a pair (file, line num) '''
- match = GDB_LINE_RE.match(output_line)
- if match:
- return match.groups()[1], match.groups()[0]
- else:
- return None
-
-def ResolveAddressesWithinABinary(binary_name, load_address, address_list):
- ''' For each address, return a pair (file, line num) '''
- commands = tempfile.NamedTemporaryFile()
- commands.write('add-symbol-file "%s" %s\n' % (binary_name, load_address))
- for addr in address_list:
- commands.write('info line *%s\n' % addr)
- commands.write('quit\n')
- commands.flush()
- gdb_commandline = 'gdb -batch -x %s 2>/dev/null' % commands.name
- gdb_pipe = os.popen(gdb_commandline)
- result = gdb_pipe.readlines()
-
- address_count = 0
- ret = {}
- for line in result:
- if line.startswith('Line'):
- ret[address_list[address_count]] = _GdbOutputToFileLine(line)
- address_count += 1
- if line.startswith('No line'):
- ret[address_list[address_count]] = (None, None)
- address_count += 1
- gdb_pipe.close()
- commands.close()
- return ret
-
-class AddressTable(object):
- ''' Object to do batched line number lookup. '''
- def __init__(self):
- self._load_addresses = {}
- self._binaries = {}
- self._all_resolved = False
-
- def AddBinaryAt(self, binary, load_address):
- ''' Register a new shared library or executable. '''
- self._load_addresses[binary] = load_address
-
- def Add(self, binary, address):
- ''' Register a lookup request. '''
- if binary == '':
- logging.warn('adding address %s in empty binary?' % address)
- if binary in self._binaries:
- self._binaries[binary].append(address)
- else:
- self._binaries[binary] = [address]
- self._all_resolved = False
-
- def ResolveAll(self):
- ''' Carry out all lookup requests. '''
- self._translation = {}
- for binary in self._binaries.keys():
- if binary != '' and binary in self._load_addresses:
- load_address = self._load_addresses[binary]
- addr = ResolveAddressesWithinABinary(
- binary, load_address, self._binaries[binary])
- self._translation[binary] = addr
- self._all_resolved = True
-
- def GetFileLine(self, binary, addr):
- ''' Get the (filename, linenum) result of a previously-registered lookup
- request.
- '''
- if self._all_resolved:
- if binary in self._translation:
- if addr in self._translation[binary]:
- return self._translation[binary][addr]
- return (None, None)
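A hypothetical end-to-end use of the batched lookup above (binary path and addresses invented; requires gdb on PATH):

    table = AddressTable()
    table.AddBinaryAt("./out/Release/libyuv_unittest", "0x400000")
    table.Add("./out/Release/libyuv_unittest", "0x40123a")
    table.ResolveAll()  # one batched gdb run per registered binary
    print(table.GetFileLine("./out/Release/libyuv_unittest", "0x40123a"))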
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.bat b/files/tools_libyuv/valgrind/libyuv_tests.bat
deleted file mode 100644
index 5fceca67..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.bat
+++ /dev/null
@@ -1,79 +0,0 @@
-@echo off
-:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-::
-:: Use of this source code is governed by a BSD-style license
-:: that can be found in the LICENSE file in the root of the source
-:: tree. An additional intellectual property rights grant can be found
-:: in the file PATENTS. All contributing project authors may
-:: be found in the AUTHORS file in the root of the source tree.
-
-:: This script is a copy of chrome_tests.bat with the following changes:
-:: - Invokes libyuv_tests.py instead of chrome_tests.py
-:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
-:: it possible to execute the Python scripts properly.
-
-:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
-set THISDIR=%~dp0
-set TOOL_NAME="unknown"
-
-:: Get the tool name and put it into TOOL_NAME {{{1
-:: NB: SHIFT command doesn't modify %*
-:PARSE_ARGS_LOOP
- if %1 == () GOTO:TOOLNAME_NOT_FOUND
- if %1 == --tool GOTO:TOOLNAME_FOUND
- SHIFT
- goto :PARSE_ARGS_LOOP
-
-:TOOLNAME_NOT_FOUND
-echo "Please specify a tool (tsan or drmemory) by using --tool flag"
-exit /B 1
-
-:TOOLNAME_FOUND
-SHIFT
-set TOOL_NAME=%1
-:: }}}
-if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
-echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
-exit /B 1
-
-:SETUP_DRMEMORY
-if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
-:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
-set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
-set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
-if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
-echo "Can't find Dr. Memory executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:DRMEMORY_BINARY_OK
-%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
-set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
-:: }}}
-goto :RUN_TESTS
-
-:SETUP_TSAN
-:: Set up PIN_COMMAND to invoke TSan {{{1
-set TSAN_PATH=%THISDIR%..\..\third_party\tsan
-set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
-if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
-echo "Can't find ThreadSanitizer executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:TSAN_BINARY_OK
-%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
-set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
-:: }}}
-goto :RUN_TESTS
-
-:RUN_TESTS
-set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
-set RUNNING_ON_VALGRIND=yes
-python %THISDIR%libyuv_tests.py %*
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.py b/files/tools_libyuv/valgrind/libyuv_tests.py
deleted file mode 100755
index e780bd95..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Runs various libyuv tests through valgrind_test.py.
-
-This script reuses chrome_tests.py from Chrome, but allows running any test
-instead of only the hard-coded ones. It uses the -t cmdline flag to do this, and
-only supports specifying a single test for each run.
-
-Suppression files:
-The Chrome valgrind directory we use as a DEPS dependency contains the following
-suppression files:
- valgrind/memcheck/suppressions.txt
- valgrind/memcheck/suppressions_mac.txt
- valgrind/tsan/suppressions.txt
- valgrind/tsan/suppressions_mac.txt
- valgrind/tsan/suppressions_win32.txt
-Since they're referenced from the chrome_tests.py script, we have similar files
-below the directory of this script. When executing, this script sets up both
-Chrome's suppression files and our own, so we can easily maintain
-libyuv-specific suppressions in our own files.
-"""
-
-import logging
-import optparse
-import os
-import sys
-
-import logging_utils
-import path_utils
-
-import chrome_tests
-
-
-class LibyuvTest(chrome_tests.ChromeTests):
- """Class that handles setup of suppressions for libyuv.
-
- Everything else is inherited from chrome_tests.ChromeTests.
- """
-
- def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
- """Override command-building method so we can add more suppressions."""
- cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
- valgrind_test_args)
-    # When ChromeTests._DefaultCommand has executed, it has set up suppression
-    # files based on what's found in the memcheck/ or tsan/ subdirectories of
-    # this script's location. When running on Mac or Windows, additional
-    # platform-specific files have also been added.
-    # Since only the files located below this directory are added, we must also
- # add the ones maintained by Chrome, located in ../../tools/valgrind.
-
- # The idea is to look for --suppression arguments in the cmd list and add a
- # modified copy of each suppression file, for the corresponding file in
- # ../../tools/valgrind.
- script_dir = path_utils.ScriptDir()
- old_base, _ = os.path.split(script_dir)
-
- checkout_src = os.path.abspath(os.path.join(script_dir, os.pardir,
- os.pardir))
- new_dir = os.path.join(checkout_src, 'tools', 'valgrind')
- add_suppressions = []
- for token in cmd:
- if '--suppressions' in token:
- add_suppressions.append(token.replace(script_dir, new_dir))
- return add_suppressions + cmd
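The --suppressions duplication described above boils down to a path substitution; a standalone sketch with assumed paths:

    script_dir = "/src/libyuv/tools_libyuv/valgrind"  # assumed layout
    new_dir = "/src/libyuv/tools/valgrind"
    cmd = ["valgrind",
           "--suppressions=%s/memcheck/suppressions.txt" % script_dir]
    add_suppressions = [t.replace(script_dir, new_dir)
                        for t in cmd if "--suppressions" in t]
    print(add_suppressions + cmd)  # Chrome's copy first, then our own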
-
-
-def main(_):
- parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
- parser.disable_interspersed_args()
- parser.add_option('-b', '--build-dir',
- help=('Location of the compiler output. Can only be used '
- 'when the test argument does not contain this path.'))
- parser.add_option("--target", help="Debug or Release")
- parser.add_option('-t', '--test', help='Test to run.')
- parser.add_option('', '--baseline', action='store_true', default=False,
- help='Generate baseline data instead of validating')
- parser.add_option('', '--gtest_filter',
- help='Additional arguments to --gtest_filter')
- parser.add_option('', '--gtest_repeat',
- help='Argument for --gtest_repeat')
- parser.add_option("--gtest_shuffle", action="store_true", default=False,
- help="Randomize tests' orders on every iteration.")
- parser.add_option("--gtest_break_on_failure", action="store_true",
- default=False,
- help="Drop in to debugger on assertion failure. Also "
- "useful for forcing tests to exit with a stack dump "
- "on the first assertion failure when running with "
- "--gtest_repeat=-1")
- parser.add_option('-v', '--verbose', action='store_true', default=False,
- help='Verbose output - enable debug log messages')
- parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
- help='Specify a valgrind tool to run the tests under')
- parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
- help='Specify custom flags for the selected valgrind tool')
- parser.add_option('', '--keep_logs', action='store_true', default=False,
- help=('Store memory tool logs in the <tool>.logs directory '
- 'instead of /tmp.\nThis can be useful for tool '
- 'developers/maintainers.\nPlease note that the <tool>'
- '.logs directory will be clobbered on tool startup.'))
- parser.add_option("--test-launcher-bot-mode", action="store_true",
- help="run the tests with --test-launcher-bot-mode")
- parser.add_option("--test-launcher-total-shards", type=int,
- help="run the tests with --test-launcher-total-shards")
- parser.add_option("--test-launcher-shard-index", type=int,
- help="run the tests with --test-launcher-shard-index")
- options, args = parser.parse_args()
-
- if options.verbose:
- logging_utils.config_root(logging.DEBUG)
- else:
- logging_utils.config_root()
-
- if not options.test:
- parser.error('--test not specified')
-
- # Support build dir both with and without the target.
- if (options.target and options.build_dir and
- not options.build_dir.endswith(options.target)):
- options.build_dir = os.path.join(options.build_dir, options.target)
-
- # If --build_dir is provided, prepend it to the test executable if needed.
- test_executable = options.test
- if options.build_dir and not test_executable.startswith(options.build_dir):
- test_executable = os.path.join(options.build_dir, test_executable)
- args = [test_executable] + args
-
- test = LibyuvTest(options, args, 'cmdline')
- return test.Run()
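For instance, with --build-dir=out --target=Release -t libyuv_unittest (values assumed), the two normalization steps above produce out/Release/libyuv_unittest:

    import os

    build_dir, target, test = "out", "Release", "libyuv_unittest"
    if not build_dir.endswith(target):
        build_dir = os.path.join(build_dir, target)
    if not test.startswith(build_dir):
        test = os.path.join(build_dir, test)
    print(test)  # out/Release/libyuv_unittest (POSIX separators)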
-
-if __name__ == '__main__':
- return_code = main(sys.argv)
- sys.exit(return_code)
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.sh b/files/tools_libyuv/valgrind/libyuv_tests.sh
deleted file mode 100755
index 249032ca..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Set up some paths and re-direct the arguments to libyuv_tests.py
-
-# This script is a copy of the chrome_tests.sh wrapper script with the following
-# changes:
-# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate
-#   the Valgrind framework install. If it fails, a fallback path is used instead
-#   (../../chromium/src/third_party/valgrind/linux_x64) and a warning message
-#   is shown by |show_locate_valgrind_failed_warning|.
-# - libyuv_tests.py is invoked instead of chrome_tests.py.
-# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
-# possible to execute the Python scripts properly.
-
-export THISDIR=`dirname $0`
-ARGV_COPY="$@"
-
-# We need to set CHROME_VALGRIND iff using Memcheck:
-# tools_libyuv/valgrind/libyuv_tests.sh --tool memcheck
-# or
-# tools_libyuv/valgrind/libyuv_tests.sh --tool=memcheck
-tool="memcheck" # Default to memcheck.
-while (( "$#" ))
-do
- if [[ "$1" == "--tool" ]]
- then
- tool="$2"
- shift
- elif [[ "$1" =~ --tool=(.*) ]]
- then
- tool="${BASH_REMATCH[1]}"
- fi
- shift
-done
-
-NEEDS_VALGRIND=0
-
-case "$tool" in
- "memcheck")
- NEEDS_VALGRIND=1
- ;;
-esac
-
-# For libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
-# scripts dir to locate the Valgrind framework install
-CHROME_VALGRIND_SCRIPTS=$THISDIR/../../tools/valgrind
-
-if [ "$NEEDS_VALGRIND" == "1" ]
-then
- CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
- if [ "$CHROME_VALGRIND" = "" ]
- then
- CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64
- echo
- echo "-------------------- WARNING ------------------------"
- echo "locate_valgrind.sh failed."
- echo "Using $CHROME_VALGRIND as a fallback location."
- echo "This might be because:"
- echo "1) This is a swarming bot"
- echo "2) You haven't set up the valgrind binaries correctly."
- echo "In this case, please make sure you have followed the instructions at"
- echo "http://www.chromium.org/developers/how-tos/using-valgrind/get-valgrind"
- echo "Notice: In the .gclient file, you need to add this for the 'libyuv'"
- echo "solution since our directory structure is different from Chromium's:"
- echo "\"custom_deps\": {"
- echo " \"libyuv/third_party/valgrind\":"
- echo " \"https://chromium.googlesource.com/chromium/deps/valgrind/binaries\","
- echo "},"
- echo "-----------------------------------------------------"
- echo
- fi
- echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
- PATH="${CHROME_VALGRIND}/bin:$PATH"
- # We need to set these variables to override default lib paths hard-coded into
- # Valgrind binary.
- export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
- export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
- # Clean up some /tmp directories that might be stale due to interrupted
- # chrome_tests.py execution.
- # FYI:
- # -mtime +1 <- only print files modified more than 24h ago,
- # -print0/-0 are needed to handle possible newlines in the filenames.
- echo "Cleanup /tmp from Valgrind stuff"
- find /tmp -maxdepth 1 \(\
- -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
- \) -mtime +1 -print0 | xargs -0 rm -rf
-fi
-
-# Add Chrome's Valgrind scripts dir to the PYTHON_PATH since it contains
-# the scripts that are needed for this script to run
-PYTHONPATH=$THISDIR/../../tools/python/google:$CHROME_VALGRIND_SCRIPTS python \
- "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/locate_valgrind.sh b/files/tools_libyuv/valgrind/locate_valgrind.sh
deleted file mode 100755
index d9594f48..00000000
--- a/files/tools_libyuv/valgrind/locate_valgrind.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Prints a path to Valgrind binaries to be used for Chromium.
-# Select the valgrind from third_party/valgrind by default,
-# but allow users to override this default without editing scripts and
-# without specifying a command-line option.
-
-export THISDIR=`dirname $0`
-
-# Users may supply their own Valgrind by setting the CHROME_VALGRIND env var.
-if [ "$CHROME_VALGRIND" = "" ]
-then
- # Guess which binaries we should use by uname
- case "$(uname -a)" in
- *Linux*x86_64*)
- PLATFORM="linux_x64"
- ;;
- *Linux*86*)
- PLATFORM="linux_x86"
- ;;
- *Darwin*9.[678].[01]*i386*)
- # Didn't test other kernels.
- PLATFORM="mac"
- ;;
- *Darwin*10.[0-9].[0-9]*i386*)
- PLATFORM="mac_10.6"
- ;;
- *Darwin*10.[0-9].[0-9]*x86_64*)
- PLATFORM="mac_10.6"
- ;;
- *Darwin*11.[0-9].[0-9]*x86_64*)
- PLATFORM="mac_10.7"
- ;;
- *)
- (echo "Sorry, your platform is not supported:" &&
- uname -a
- echo
- echo "If you're on Mac OS X, please see http://crbug.com/441425") >&2
- exit 42
- esac
-
- # The binaries should be in third_party/valgrind
- # (checked out from deps/third_party/valgrind/binaries).
- CHROME_VALGRIND="$THISDIR/../../third_party/valgrind/$PLATFORM"
-
- # TODO(timurrrr): readlink -f is not present on Mac...
- if [ "$PLATFORM" != "mac" ] && \
- [ "$PLATFORM" != "mac_10.6" ] && \
- [ "$PLATFORM" != "mac_10.7" ]
- then
- # Get rid of all "../" dirs
- CHROME_VALGRIND=$(readlink -f $CHROME_VALGRIND)
- fi
-fi
-
-if ! test -x $CHROME_VALGRIND/bin/valgrind
-then
- echo "Oops, could not find Valgrind binaries in your checkout." >&2
- echo "Please see" >&2
- echo " http://dev.chromium.org/developers/how-tos/using-valgrind/get-valgrind" >&2
- echo "for the instructions on how to download pre-built binaries." >&2
- exit 1
-fi
-
-echo $CHROME_VALGRIND
diff --git a/files/tools_libyuv/valgrind/memcheck/OWNERS b/files/tools_libyuv/valgrind/memcheck/OWNERS
deleted file mode 100644
index 72e8ffc0..00000000
--- a/files/tools_libyuv/valgrind/memcheck/OWNERS
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
deleted file mode 100644
index 03329214..00000000
--- a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""
-Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
-
-See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
-for more details on the presubmit API built into gcl.
-"""
-
-import os
-import re
-import sys
-
-def CheckChange(input_api, output_api):
- """Checks the memcheck suppressions files for bad data."""
-
- # Add the path to the Chrome valgrind dir to the import path:
- tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..', '..',
- 'tools', 'valgrind')
- sys.path.append(tools_vg_path)
- import suppressions
-
- sup_regex = re.compile('suppressions.*\.txt$')
- suppressions = {}
- errors = []
- check_for_memcheck = False
- # skip_next_line has 3 possible values:
- # - False: don't skip the next line.
- # - 'skip_suppression_name': the next line is a suppression name, skip.
- # - 'skip_param': the next line is a system call parameter error, skip.
- skip_next_line = False
- for f in filter(lambda x: sup_regex.search(x.LocalPath()),
- input_api.AffectedFiles()):
- for line, line_num in zip(f.NewContents(),
- xrange(1, len(f.NewContents()) + 1)):
- line = line.lstrip()
- if line.startswith('#') or not line:
- continue
-
- if skip_next_line:
- if skip_next_line == 'skip_suppression_name':
- if 'insert_a_suppression_name_here' in line:
- errors.append('"insert_a_suppression_name_here" is not a valid '
- 'suppression name')
- if suppressions.has_key(line):
-          if f.LocalPath() == suppressions[line][0].LocalPath():
- errors.append('suppression with name "%s" at %s line %s '
- 'has already been defined at line %s' %
- (line, f.LocalPath(), line_num,
- suppressions[line][1]))
- else:
- errors.append('suppression with name "%s" at %s line %s '
- 'has already been defined at %s line %s' %
- (line, f.LocalPath(), line_num,
-                           suppressions[line][0].LocalPath(),
-                           suppressions[line][1]))
- else:
- suppressions[line] = (f, line_num)
-          check_for_memcheck = True
- skip_next_line = False
- continue
- if check_for_memcheck:
- if not line.startswith('Memcheck:'):
- errors.append('"%s" should be "Memcheck:..." in %s line %s' %
- (line, f.LocalPath(), line_num))
-          check_for_memcheck = False
- if line == '{':
- skip_next_line = 'skip_suppression_name'
- continue
- if line == "Memcheck:Param":
- skip_next_line = 'skip_param'
- continue
-
- if (line.startswith('fun:') or line.startswith('obj:') or
- line.startswith('Memcheck:') or line == '}' or
- line == '...'):
- continue
- errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
- line_num))
- if errors:
- return [output_api.PresubmitError('\n'.join(errors))]
- return []
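For reference, a record that passes the checks above looks like this (the name and frames are invented for illustration):

    {
      bug_12345_example_name
      Memcheck:Uninitialized
      fun:_ZN6libyuv*
      ...
      obj:*libyuv_unittest
    }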
-
-def CheckChangeOnUpload(input_api, output_api):
- return CheckChange(input_api, output_api)
-
-def CheckChangeOnCommit(input_api, output_api):
- return CheckChange(input_api, output_api)
-
-def GetPreferredTrySlaves():
- # We don't have any memcheck slaves yet, so there's no use for this method.
- # When we have, the slave name(s) should be put into this list.
- return []
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions.txt b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
deleted file mode 100644
index 3f0f6d44..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-# There are three kinds of suppressions in this file.
-# 1. third_party libraries
-# 2. libyuv stuff
-# 3. libjingle stuff (talk folder)
-#-----------------------------------------------------------------------
-
-# third_party libraries
-{
- bug_729
- Memcheck:Free
- fun:_ZdaPv
- ...
- fun:_ZN7testing8internal12UnitTestImplD1Ev
- ...
-}
-
-# libyuv (empty so far)
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
deleted file mode 100644
index 3ad0c8cc..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
deleted file mode 100644
index 3ad0c8cc..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-
diff --git a/files/tools_libyuv/valgrind/memcheck_analyze.py b/files/tools_libyuv/valgrind/memcheck_analyze.py
deleted file mode 100755
index 80e85eb4..00000000
--- a/files/tools_libyuv/valgrind/memcheck_analyze.py
+++ /dev/null
@@ -1,644 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# memcheck_analyze.py
-
-''' Given a valgrind XML file, parses errors and de-duplicates them.'''
-
-import gdb_helper
-
-from collections import defaultdict
-import hashlib
-import logging
-import optparse
-import os
-import re
-import subprocess
-import sys
-import time
-from xml.dom.minidom import parse
-from xml.parsers.expat import ExpatError
-
-import common
-
-# Global symbol table (yuck)
-TheAddressTable = None
-
-# These are regexps that define functions (using C++ mangled names)
-# we don't want to see in stack traces while pretty printing
-# or generating suppressions.
-# Just stop printing the stack/suppression frames when the current one
-# matches any of these.
-_BORING_CALLERS = common.BoringCallers(mangled=True, use_re_wildcards=True)
-
-def getTextOf(top_node, name):
- ''' Returns all text in all DOM nodes with a certain |name| that are children
- of |top_node|.
- '''
-
- text = ""
- for nodes_named in top_node.getElementsByTagName(name):
- text += "".join([node.data for node in nodes_named.childNodes
- if node.nodeType == node.TEXT_NODE])
- return text
-
-def getCDATAOf(top_node, name):
- ''' Returns all CDATA in all DOM nodes with a certain |name| that are children
- of |top_node|.
- '''
-
- text = ""
- for nodes_named in top_node.getElementsByTagName(name):
- text += "".join([node.data for node in nodes_named.childNodes
- if node.nodeType == node.CDATA_SECTION_NODE])
- if (text == ""):
- return None
- return text
-
-def shortenFilePath(source_dir, directory):
- '''Returns a string with the string prefix |source_dir| removed from
- |directory|.'''
- prefixes_to_cut = ["build/src/", "valgrind/coregrind/", "out/Release/../../"]
-
- if source_dir:
- prefixes_to_cut.append(source_dir)
-
- for p in prefixes_to_cut:
- index = directory.rfind(p)
- if index != -1:
- directory = directory[index + len(p):]
-
- return directory
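For example (path assumed), the buildbot prefix "build/src/" is cut wherever it last occurs:

    # Using shortenFilePath() above on an assumed buildbot path:
    print(shortenFilePath(None, "/b/build/src/libyuv/source/row_common.cc"))
    # -> libyuv/source/row_common.cc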
-
-# Constants that give real names to the abbreviations in valgrind XML output.
-INSTRUCTION_POINTER = "ip"
-OBJECT_FILE = "obj"
-FUNCTION_NAME = "fn"
-SRC_FILE_DIR = "dir"
-SRC_FILE_NAME = "file"
-SRC_LINE = "line"
-
-def gatherFrames(node, source_dir):
- frames = []
- for frame in node.getElementsByTagName("frame"):
- frame_dict = {
- INSTRUCTION_POINTER : getTextOf(frame, INSTRUCTION_POINTER),
- OBJECT_FILE : getTextOf(frame, OBJECT_FILE),
- FUNCTION_NAME : getTextOf(frame, FUNCTION_NAME),
- SRC_FILE_DIR : shortenFilePath(
- source_dir, getTextOf(frame, SRC_FILE_DIR)),
- SRC_FILE_NAME : getTextOf(frame, SRC_FILE_NAME),
- SRC_LINE : getTextOf(frame, SRC_LINE)
- }
-
- # Ignore this frame and all the following if it's a "boring" function.
- enough_frames = False
- for regexp in _BORING_CALLERS:
- if re.match("^%s$" % regexp, frame_dict[FUNCTION_NAME]):
- enough_frames = True
- break
- if enough_frames:
- break
-
- frames += [frame_dict]
-
- global TheAddressTable
- if TheAddressTable != None and frame_dict[SRC_LINE] == "":
- # Try using gdb
- TheAddressTable.Add(frame_dict[OBJECT_FILE],
- frame_dict[INSTRUCTION_POINTER])
- return frames
-
-class ValgrindError:
- ''' Takes a <DOM Element: error> node and reads all the data from it. A
- ValgrindError is immutable and is hashed on its pretty printed output.
- '''
-
- def __init__(self, source_dir, error_node, commandline, testcase):
- ''' Copies all the relevant information out of the DOM and into object
- properties.
-
- Args:
-      source_dir: Prefix that should be stripped from the <dir> node.
-      error_node: The <error></error> DOM node we're extracting from.
-      commandline: The command that was run under valgrind.
- testcase: The test case name, if known.
- '''
-
- # Valgrind errors contain one <what><stack> pair, plus an optional
- # <auxwhat><stack> pair, plus an optional <origin><what><stack></origin>,
- # plus (since 3.5.0) a <suppression></suppression> pair.
- # (Origin is nicely enclosed; too bad the other two aren't.)
- # The most common way to see all three in one report is
- # a syscall with a parameter that points to uninitialized memory, e.g.
- # Format:
- # <error>
- # <unique>0x6d</unique>
- # <tid>1</tid>
- # <kind>SyscallParam</kind>
- # <what>Syscall param write(buf) points to uninitialised byte(s)</what>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # <auxwhat>Address 0x5c9af4f is 7 bytes inside a block of ...</auxwhat>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # <origin>
- # <what>Uninitialised value was created by a heap allocation</what>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # </origin>
- # <suppression>
- # <sname>insert_a_suppression_name_here</sname>
- # <skind>Memcheck:Param</skind>
- # <skaux>write(buf)</skaux>
- # <sframe> <fun>__write_nocancel</fun> </sframe>
- # ...
- # <sframe> <fun>main</fun> </sframe>
- # <rawtext>
- # <![CDATA[
- # {
- # <insert_a_suppression_name_here>
- # Memcheck:Param
- # write(buf)
- # fun:__write_nocancel
- # ...
- # fun:main
- # }
- # ]]>
- # </rawtext>
- # </suppression>
- # </error>
- #
- # Each frame looks like this:
- # <frame>
- # <ip>0x83751BC</ip>
- # <obj>/data/dkegel/chrome-build/src/out/Release/base_unittests</obj>
- # <fn>_ZN7testing8internal12TestInfoImpl7RunTestEPNS_8TestInfoE</fn>
- # <dir>/data/dkegel/chrome-build/src/testing/gtest/src</dir>
- # <file>gtest-internal-inl.h</file>
- # <line>655</line>
- # </frame>
- # although the dir, file, and line elements are missing if there is
- # no debug info.
-
- self._kind = getTextOf(error_node, "kind")
- self._backtraces = []
- self._suppression = None
- self._commandline = commandline
- self._testcase = testcase
- self._additional = []
-
- # Iterate through the nodes, parsing <what|auxwhat><stack> pairs.
- description = None
- for node in error_node.childNodes:
- if node.localName == "what" or node.localName == "auxwhat":
- description = "".join([n.data for n in node.childNodes
- if n.nodeType == n.TEXT_NODE])
- elif node.localName == "xwhat":
- description = getTextOf(node, "text")
- elif node.localName == "stack":
- assert description
- self._backtraces.append([description, gatherFrames(node, source_dir)])
- description = None
- elif node.localName == "origin":
- description = getTextOf(node, "what")
- stack = node.getElementsByTagName("stack")[0]
- frames = gatherFrames(stack, source_dir)
- self._backtraces.append([description, frames])
- description = None
- stack = None
- frames = None
-      elif description and node.localName is not None:
-        # The latest description has no stack, e.g. "Address 0x28 is unknown"
- self._additional.append(description)
- description = None
-
- if node.localName == "suppression":
-          self._suppression = getCDATAOf(node, "rawtext")
-
- def __str__(self):
- ''' Pretty print the type and backtrace(s) of this specific error,
- including suppression (which is just a mangled backtrace).'''
- output = ""
- output += "\n" # Make sure the ### is at the beginning of line.
- output += "### BEGIN MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
- self.ErrorHash()
- if (self._commandline):
- output += self._commandline + "\n"
-
- output += self._kind + "\n"
- for backtrace in self._backtraces:
- output += backtrace[0] + "\n"
- filter = subprocess.Popen("c++filt -n", stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- shell=True,
- close_fds=True)
- buf = ""
- for frame in backtrace[1]:
- buf += (frame[FUNCTION_NAME] or frame[INSTRUCTION_POINTER]) + "\n"
- (stdoutbuf, stderrbuf) = filter.communicate(buf.encode('latin-1'))
- demangled_names = stdoutbuf.split("\n")
-
- i = 0
- for frame in backtrace[1]:
- output += (" " + demangled_names[i])
- i = i + 1
-
- global TheAddressTable
- if TheAddressTable != None and frame[SRC_FILE_DIR] == "":
- # Try using gdb
- foo = TheAddressTable.GetFileLine(frame[OBJECT_FILE],
- frame[INSTRUCTION_POINTER])
- if foo[0] != None:
- output += (" (" + foo[0] + ":" + foo[1] + ")")
- elif frame[SRC_FILE_DIR] != "":
- output += (" (" + frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME] +
- ":" + frame[SRC_LINE] + ")")
- else:
- output += " (" + frame[OBJECT_FILE] + ")"
- output += "\n"
-
- for additional in self._additional:
- output += additional + "\n"
-
- assert self._suppression != None, "Your Valgrind doesn't generate " \
- "suppressions - is it too old?"
-
- if self._testcase:
- output += "The report came from the `%s` test.\n" % self._testcase
- output += "Suppression (error hash=#%016X#):\n" % self.ErrorHash()
- output += (" For more info on using suppressions see "
- "http://dev.chromium.org/developers/tree-sheriffs/sheriff-details-chromium/memory-sheriff#TOC-Suppressing-memory-reports")
-
- # Widen suppression slightly to make portable between mac and linux
- # TODO(timurrrr): Oops, these transformations should happen
- # BEFORE calculating the hash!
-    supp = self._suppression
- supp = supp.replace("fun:_Znwj", "fun:_Znw*")
- supp = supp.replace("fun:_Znwm", "fun:_Znw*")
- supp = supp.replace("fun:_Znaj", "fun:_Zna*")
- supp = supp.replace("fun:_Znam", "fun:_Zna*")
-
- # Make suppressions even less platform-dependent.
- for sz in [1, 2, 4, 8]:
- supp = supp.replace("Memcheck:Addr%d" % sz, "Memcheck:Unaddressable")
- supp = supp.replace("Memcheck:Value%d" % sz, "Memcheck:Uninitialized")
- supp = supp.replace("Memcheck:Cond", "Memcheck:Uninitialized")
-
- # Split into lines so we can enforce length limits
- supplines = supp.split("\n")
- supp = None # to avoid re-use
-
- # Truncate at line 26 (VG_MAX_SUPP_CALLERS plus 2 for name and type)
- # or at the first 'boring' caller.
- # (https://bugs.kde.org/show_bug.cgi?id=199468 proposes raising
- # VG_MAX_SUPP_CALLERS, but we're probably fine with it as is.)
-    newlen = min(26, len(supplines))
-
- # Drop boring frames and all the following.
- enough_frames = False
- for frameno in range(newlen):
- for boring_caller in _BORING_CALLERS:
- if re.match("^ +fun:%s$" % boring_caller, supplines[frameno]):
- newlen = frameno
- enough_frames = True
- break
- if enough_frames:
- break
- if (len(supplines) > newlen):
- supplines = supplines[0:newlen]
- supplines.append("}")
-
- for frame in range(len(supplines)):
- # Replace the always-changing anonymous namespace prefix with "*".
- m = re.match("( +fun:)_ZN.*_GLOBAL__N_.*\.cc_" +
- "[0-9a-fA-F]{8}_[0-9a-fA-F]{8}(.*)",
- supplines[frame])
- if m:
- supplines[frame] = "*".join(m.groups())
-
- output += "\n".join(supplines) + "\n"
- output += "### END MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
- self.ErrorHash()
-
- return output
-
- def UniqueString(self):
- ''' String to use for object identity. Don't print this, use str(obj)
- instead.'''
- rep = self._kind + " "
- for backtrace in self._backtraces:
- for frame in backtrace[1]:
- rep += frame[FUNCTION_NAME]
-
- if frame[SRC_FILE_DIR] != "":
- rep += frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME]
- else:
- rep += frame[OBJECT_FILE]
-
- return rep
-
- # This is a device-independent hash identifying the suppression.
- # By printing out this hash we can find duplicate reports between tests and
- # different shards running on multiple buildbots
- def ErrorHash(self):
- return int(hashlib.md5(self.UniqueString()).hexdigest()[:16], 16)
-
- def __hash__(self):
- return hash(self.UniqueString())
- def __eq__(self, rhs):
- return self.UniqueString() == rhs
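The platform-widening rewrites applied in __str__ above are plain string substitutions; a standalone sketch on a made-up suppression:

    supp = "{\n  <name>\n  Memcheck:Addr4\n  fun:_Znwj\n  fun:main\n}"
    for old, new in [("fun:_Znwj", "fun:_Znw*"),
                     ("fun:_Znwm", "fun:_Znw*"),
                     ("Memcheck:Addr4", "Memcheck:Unaddressable"),
                     ("Memcheck:Cond", "Memcheck:Uninitialized")]:
        supp = supp.replace(old, new)
    print(supp)  # operator new and the access width are now wildcarded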
-
-def log_is_finished(f, force_finish):
- f.seek(0)
- prev_line = ""
- while True:
- line = f.readline()
- if line == "":
- if not force_finish:
- return False
- # Okay, the log is not finished but we can make it up to be parseable:
- if prev_line.strip() in ["</error>", "</errorcounts>", "</status>"]:
- f.write("</valgrindoutput>\n")
- return True
- return False
- if '</valgrindoutput>' in line:
- # Valgrind often has garbage after </valgrindoutput> upon crash.
- f.truncate()
- return True
- prev_line = line
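A hypothetical repair pass using the helper above, for a log whose writer died mid-stream (the filename is assumed):

    with open("valgrind.12345.xml", "r+") as f:  # assumed log name
        if log_is_finished(f, True):  # True forces closing tags to be added
            print("log is (or has been made) parseable")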
-
-class MemcheckAnalyzer:
-  ''' Given a set of Valgrind XML files, parse all the errors out of them,
-  de-duplicate them and output the results.'''
-
- SANITY_TEST_SUPPRESSIONS = {
- "Memcheck sanity test 01 (memory leak).": 1,
- "Memcheck sanity test 02 (malloc/read left).": 1,
- "Memcheck sanity test 03 (malloc/read right).": 1,
- "Memcheck sanity test 04 (malloc/write left).": 1,
- "Memcheck sanity test 05 (malloc/write right).": 1,
- "Memcheck sanity test 06 (new/read left).": 1,
- "Memcheck sanity test 07 (new/read right).": 1,
- "Memcheck sanity test 08 (new/write left).": 1,
- "Memcheck sanity test 09 (new/write right).": 1,
- "Memcheck sanity test 10 (write after free).": 1,
- "Memcheck sanity test 11 (write after delete).": 1,
- "Memcheck sanity test 12 (array deleted without []).": 1,
- "Memcheck sanity test 13 (single element deleted with []).": 1,
- "Memcheck sanity test 14 (malloc/read uninit).": 1,
- "Memcheck sanity test 15 (new/read uninit).": 1,
- }
-
- # Max time to wait for memcheck logs to complete.
- LOG_COMPLETION_TIMEOUT = 180.0
-
- def __init__(self, source_dir, show_all_leaks=False, use_gdb=False):
- '''Create a parser for Memcheck logs.
-
- Args:
- source_dir: Path to top of source tree for this build
- show_all_leaks: Whether to show even less important leaks
- use_gdb: Whether to use gdb to resolve source filenames and line numbers
- in the report stacktraces
- '''
- self._source_dir = source_dir
- self._show_all_leaks = show_all_leaks
- self._use_gdb = use_gdb
-
- # Contains the set of unique errors
- self._errors = set()
-
-    # Contains the time when we started analyzing the first log file.
- # This variable is used to skip incomplete logs after some timeout.
- self._analyze_start_time = None
-
-
- def Report(self, files, testcase, check_sanity=False):
- '''Reads in a set of files and prints Memcheck report.
-
- Args:
-      files: A list of filenames.
-      testcase: The test case name, if known.
-      check_sanity: If true, search for SANITY_TEST_SUPPRESSIONS.
- '''
- # Beyond the detailed errors parsed by ValgrindError above,
-    # the XML files contain records describing suppressions that were used:
- # <suppcounts>
- # <pair>
- # <count>28</count>
- # <name>pango_font_leak_todo</name>
- # </pair>
- # <pair>
- # <count>378</count>
- # <name>bug_13243</name>
- # </pair>
-    # </suppcounts>
- # Collect these and print them at the end.
- #
- # With our patch for https://bugs.kde.org/show_bug.cgi?id=205000 in,
- # the file also includes records of the form
- # <load_obj><obj>/usr/lib/libgcc_s.1.dylib</obj><ip>0x27000</ip></load_obj>
- # giving the filename and load address of each binary that was mapped
- # into the process.
-
- global TheAddressTable
- if self._use_gdb:
- TheAddressTable = gdb_helper.AddressTable()
- else:
- TheAddressTable = None
- cur_report_errors = set()
- suppcounts = defaultdict(int)
- badfiles = set()
-
- if self._analyze_start_time == None:
- self._analyze_start_time = time.time()
- start_time = self._analyze_start_time
-
- parse_failed = False
- for file in files:
- # Wait up to three minutes for valgrind to finish writing all files,
- # but after that, just skip incomplete files and warn.
- f = open(file, "r+")
- pid = re.match(".*\.([0-9]+)$", file)
- if pid:
- pid = pid.groups()[0]
- found = False
- running = True
- firstrun = True
- skip = False
- origsize = os.path.getsize(file)
- while (running and not found and not skip and
- (firstrun or
- ((time.time() - start_time) < self.LOG_COMPLETION_TIMEOUT))):
- firstrun = False
- f.seek(0)
- if pid:
- # Make sure the process is still running so we don't wait for
- # 3 minutes if it was killed. See http://crbug.com/17453
- ps_out = subprocess.Popen("ps p %s" % pid, shell=True,
- stdout=subprocess.PIPE).stdout
- if len(ps_out.readlines()) < 2:
- running = False
- else:
- skip = True
- running = False
- found = log_is_finished(f, False)
- if not running and not found:
- logging.warn("Valgrind process PID = %s is not running but its "
- "XML log has not been finished correctly.\n"
- "Make it up by adding some closing tags manually." % pid)
- found = log_is_finished(f, not running)
- if running and not found:
- time.sleep(1)
- f.close()
- if not found:
- badfiles.add(file)
- else:
- newsize = os.path.getsize(file)
- if origsize > newsize+1:
- logging.warn(str(origsize - newsize) +
- " bytes of junk were after </valgrindoutput> in %s!" %
- file)
- try:
-        parsed_file = parse(file)
- except ExpatError, e:
- parse_failed = True
- logging.warn("could not parse %s: %s" % (file, e))
- lineno = e.lineno - 1
- context_lines = 5
- context_start = max(0, lineno - context_lines)
- context_end = lineno + context_lines + 1
- context_file = open(file, "r")
- for i in range(0, context_start):
- context_file.readline()
- for i in range(context_start, context_end):
- context_data = context_file.readline().rstrip()
- if i != lineno:
- logging.warn(" %s" % context_data)
- else:
- logging.warn("> %s" % context_data)
- context_file.close()
- continue
- if TheAddressTable != None:
- load_objs = parsed_file.getElementsByTagName("load_obj")
- for load_obj in load_objs:
- obj = getTextOf(load_obj, "obj")
- ip = getTextOf(load_obj, "ip")
- TheAddressTable.AddBinaryAt(obj, ip)
-
- commandline = None
-      preamble = parsed_file.getElementsByTagName("preamble")[0]
- for node in preamble.getElementsByTagName("line"):
- if node.localName == "line":
- for x in node.childNodes:
- if x.nodeType == node.TEXT_NODE and "Command" in x.data:
- commandline = x.data
- break
-
- raw_errors = parsed_file.getElementsByTagName("error")
- for raw_error in raw_errors:
- # Ignore "possible" leaks for now by default.
- if (self._show_all_leaks or
- getTextOf(raw_error, "kind") != "Leak_PossiblyLost"):
- error = ValgrindError(self._source_dir,
- raw_error, commandline, testcase)
- if error not in cur_report_errors:
-          # We haven't seen such errors during this report yet...
- if error in self._errors:
- # ... but we saw it in earlier reports, e.g. previous UI test
- cur_report_errors.add("This error was already printed in "
- "some other test, see 'hash=#%016X#'" % \
- error.ErrorHash())
- else:
- # ... and we haven't seen it in other tests as well
- self._errors.add(error)
- cur_report_errors.add(error)
-
- suppcountlist = parsed_file.getElementsByTagName("suppcounts")
- if len(suppcountlist) > 0:
- suppcountlist = suppcountlist[0]
- for node in suppcountlist.getElementsByTagName("pair"):
-          count = getTextOf(node, "count")
-          name = getTextOf(node, "name")
- suppcounts[name] += int(count)
-
- if len(badfiles) > 0:
- logging.warn("valgrind didn't finish writing %d files?!" % len(badfiles))
- for file in badfiles:
- logging.warn("Last 20 lines of %s :" % file)
- os.system("tail -n 20 '%s' 1>&2" % file)
-
- if parse_failed:
- logging.error("FAIL! Couldn't parse Valgrind output file")
- return -2
-
- common.PrintUsedSuppressionsList(suppcounts)
-
- retcode = 0
- if cur_report_errors:
- logging.error("FAIL! There were %s errors: " % len(cur_report_errors))
-
- if TheAddressTable != None:
- TheAddressTable.ResolveAll()
-
- for error in cur_report_errors:
- logging.error(error)
-
- retcode = -1
-
- # Report tool's insanity even if there were errors.
- if check_sanity:
-      # Copy, so repeated Report() calls don't mutate the class-level dict.
-      remaining_sanity_supp = dict(MemcheckAnalyzer.SANITY_TEST_SUPPRESSIONS)
- for (name, count) in suppcounts.iteritems():
- # Workaround for http://crbug.com/334074
- if (name in remaining_sanity_supp and
- remaining_sanity_supp[name] <= count):
- del remaining_sanity_supp[name]
- if remaining_sanity_supp:
- logging.error("FAIL! Sanity check failed!")
- logging.info("The following test errors were not handled: ")
- for (name, count) in remaining_sanity_supp.iteritems():
- logging.info(" * %dx %s" % (count, name))
- retcode = -3
-
- if retcode != 0:
- return retcode
-
- logging.info("PASS! No errors found!")
- return 0
-
-
-def _main():
- '''For testing only. The MemcheckAnalyzer class should be imported instead.'''
- parser = optparse.OptionParser("usage: %prog [options] <files to analyze>")
- parser.add_option("", "--source-dir",
- help="path to top of source tree for this build"
- "(used to normalize source paths in baseline)")
-
- (options, args) = parser.parse_args()
- if len(args) == 0:
- parser.error("no filename specified")
- filenames = args
-
- analyzer = MemcheckAnalyzer(options.source_dir, use_gdb=True)
- return analyzer.Report(filenames, None)
-
-
-if __name__ == "__main__":
- sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/valgrind.sh b/files/tools_libyuv/valgrind/valgrind.sh
deleted file mode 100755
index 7f3f7926..00000000
--- a/files/tools_libyuv/valgrind/valgrind.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a small script for manually launching valgrind, along with passing
-# it the suppression file, and some helpful arguments (automatically attaching
-# the debugger on failures, etc). Run it from your repo root, something like:
-# $ sh ./tools/valgrind/valgrind.sh ./out/Debug/chrome
-#
-# This is mostly intended for running the chrome browser interactively.
-# To run unit tests, you probably want to run chrome_tests.sh instead.
-# That's the script used by the valgrind buildbot.
-
-export THISDIR=`dirname $0`
-
-setup_memcheck() {
- RUN_COMMAND="valgrind"
-
- # Prompt to attach gdb when there was an error detected.
- DEFAULT_TOOL_FLAGS=("--db-command=gdb -nw %f %p" "--db-attach=yes" \
- # Keep the registers in gdb in sync with the code.
- "--vex-iropt-register-updates=allregs-at-mem-access" \
- # Overwrite newly allocated or freed objects
- # with 0x41 to catch improper use.
- "--malloc-fill=41" "--free-fill=41" \
- # Record more stack frames in reports (valgrind's default is 12).
- "--num-callers=30")
-}
-
-setup_unknown() {
- echo "Unknown tool \"$TOOL_NAME\" specified, the result is not guaranteed"
- DEFAULT_TOOL_FLAGS=()
-}
-
-set -e
-
-if [ $# -eq 0 ]; then
- echo "usage: <command to run> <arguments ...>"
- exit 1
-fi
-
-TOOL_NAME="memcheck"
-declare -a DEFAULT_TOOL_FLAGS
-
-# Select a tool different from memcheck with --tool=TOOL as a first argument
-TMP_STR=`echo $1 | sed 's/^\-\-tool=//'`
-if [ "$TMP_STR" != "$1" ]; then
- TOOL_NAME="$TMP_STR"
- shift
-fi
-
-if echo "$@" | grep "\-\-tool" ; then
- echo "--tool=TOOL must be the first argument" >&2
- exit 1
-fi
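-
-# Example invocation (hypothetical binary path), selecting memcheck explicitly:
-#   $ sh ./tools_libyuv/valgrind/valgrind.sh --tool=memcheck ./out/Debug/libyuv_unittest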
-
-case $TOOL_NAME in
- memcheck*) setup_memcheck "$1";;
- *) setup_unknown;;
-esac
-
-
-SUPPRESSIONS="$THISDIR/$TOOL_NAME/suppressions.txt"
-
-CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
-if [ "$CHROME_VALGRIND" = "" ]
-then
- # locate_valgrind.sh failed
- exit 1
-fi
-echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
-set -x
-PATH="${CHROME_VALGRIND}/bin:$PATH"
-# We need to set these variables to override the default lib paths hard-coded
-# into the Valgrind binary.
-export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
-export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
-# G_SLICE=always-malloc: make glib use system malloc
-# NSS_DISABLE_UNLOAD=1: make nss skip dlclosing dynamically loaded modules,
-# which would result in "obj:*" in backtraces.
-# NSS_DISABLE_ARENA_FREE_LIST=1: make nss use system malloc
-# G_DEBUG=fatal_warnings: make GTK abort on any critical or warning assertions.
-# If it crashes on you in the Options menu, you hit bug 19751,
-# comment out the G_DEBUG=fatal_warnings line.
-#
-# GTEST_DEATH_TEST_USE_FORK=1: make gtest death tests valgrind-friendly
-#
-# When everyone has the latest valgrind, we might want to add
-# --show-possibly-lost=no
-# to ignore possible but not definite leaks.
-
-G_SLICE=always-malloc \
-NSS_DISABLE_UNLOAD=1 \
-NSS_DISABLE_ARENA_FREE_LIST=1 \
-G_DEBUG=fatal_warnings \
-GTEST_DEATH_TEST_USE_FORK=1 \
-$RUN_COMMAND \
- --trace-children=yes \
- --leak-check=yes \
- --suppressions="$SUPPRESSIONS" \
- "${DEFAULT_TOOL_FLAGS[@]}" \
- "$@"
diff --git a/files/tools_libyuv/valgrind/valgrind_test.py b/files/tools_libyuv/valgrind/valgrind_test.py
deleted file mode 100755
index 0fd3d97f..00000000
--- a/files/tools_libyuv/valgrind/valgrind_test.py
+++ /dev/null
@@ -1,517 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Runs an exe through Valgrind and puts the intermediate files in a
-directory.
-"""
-
-import datetime
-import glob
-import logging
-import optparse
-import os
-import re
-import shutil
-import stat
-import subprocess
-import sys
-import tempfile
-
-import common
-
-import memcheck_analyze
-
-class BaseTool(object):
- """Abstract class for running dynamic error detection tools.
-
- Always subclass this and implement ToolCommand with framework- and
- tool-specific stuff.
- """
-
- def __init__(self):
- temp_parent_dir = None
- self.log_parent_dir = ""
- if common.IsWindows():
- # gpu process on Windows Vista+ runs at Low Integrity and can only
- # write to certain directories (http://crbug.com/119131)
- #
- # TODO(bruening): if scripts die in middle and don't clean up temp
- # dir, we'll accumulate files in profile dir. should remove
- # really old files automatically.
- profile = os.getenv("USERPROFILE")
- if profile:
- self.log_parent_dir = profile + "\\AppData\\LocalLow\\"
- if os.path.exists(self.log_parent_dir):
- self.log_parent_dir = common.NormalizeWindowsPath(self.log_parent_dir)
- temp_parent_dir = self.log_parent_dir
- # Generated every time (even when overridden)
- self.temp_dir = tempfile.mkdtemp(prefix="vg_logs_", dir=temp_parent_dir)
- self.log_dir = self.temp_dir # overridable by --keep_logs
- self.option_parser_hooks = []
- # TODO(glider): we may not need some of the env vars on some of the
- # platforms.
- self._env = {
- "G_SLICE" : "always-malloc",
- "NSS_DISABLE_UNLOAD" : "1",
- "NSS_DISABLE_ARENA_FREE_LIST" : "1",
- "GTEST_DEATH_TEST_USE_FORK": "1",
- }
-
- def ToolName(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def Analyze(self, check_sanity=False):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def RegisterOptionParserHook(self, hook):
- # Frameworks and tools can add their own flags to the parser.
- self.option_parser_hooks.append(hook)
-
- def CreateOptionParser(self):
- # Defines Chromium-specific flags.
- self._parser = optparse.OptionParser("usage: %prog [options] <program to "
- "test>")
- self._parser.disable_interspersed_args()
- self._parser.add_option("-t", "--timeout",
- dest="timeout", metavar="TIMEOUT", default=10000,
- help="timeout in seconds for the run (default 10000)")
- self._parser.add_option("", "--build-dir",
- help="the location of the compiler output")
- self._parser.add_option("", "--source-dir",
- help="path to top of source tree for this build"
- "(used to normalize source paths in baseline)")
- self._parser.add_option("", "--gtest_filter", default="",
- help="which test case to run")
- self._parser.add_option("", "--gtest_repeat",
- help="how many times to run each test")
- self._parser.add_option("", "--gtest_print_time", action="store_true",
- default=False,
- help="show how long each test takes")
- self._parser.add_option("", "--ignore_exit_code", action="store_true",
- default=False,
- help="ignore exit code of the test "
- "(e.g. test failures)")
- self._parser.add_option("", "--keep_logs", action="store_true",
- default=False,
- help="store memory tool logs in the <tool>.logs "
- "directory instead of /tmp.\nThis can be "
- "useful for tool developers/maintainers.\n"
- "Please note that the <tool>.logs directory "
- "will be clobbered on tool startup.")
-
- # To add framework- or tool-specific flags, please add a hook using
- # RegisterOptionParserHook in the corresponding subclass.
- # See ValgrindTool for an example.
- for hook in self.option_parser_hooks:
- hook(self, self._parser)
-
- def ParseArgv(self, args):
- self.CreateOptionParser()
-
- # self._tool_flags will store those tool flags which we don't parse
- # manually in this script.
- self._tool_flags = []
- known_args = []
-
- """ We assume that the first argument not starting with "-" is a program
- name and all the following flags should be passed to the program.
- TODO(timurrrr): customize optparse instead
- """
- while len(args) > 0 and args[0][:1] == "-":
- arg = args[0]
- if (arg == "--"):
- break
- if self._parser.has_option(arg.split("=")[0]):
- known_args += [arg]
- else:
- self._tool_flags += [arg]
- args = args[1:]
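-
- # For example (flags here are illustrative): given
- #   --timeout=60 --error-exitcode=1 ./libyuv_unittest --gtest_color=no
- # "--timeout=60" is parsed by this script, "--error-exitcode=1" falls
- # through to self._tool_flags, and everything from "./libyuv_unittest"
- # onward is kept as the program and its own arguments.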
-
- if len(args) > 0:
- known_args += args
-
- self._options, self._args = self._parser.parse_args(known_args)
-
- self._timeout = int(self._options.timeout)
- self._source_dir = self._options.source_dir
- if self._options.keep_logs:
- # log_parent_dir has trailing slash if non-empty
- self.log_dir = self.log_parent_dir + "%s.logs" % self.ToolName()
- if os.path.exists(self.log_dir):
- shutil.rmtree(self.log_dir)
- os.mkdir(self.log_dir)
- logging.info("Logs are in " + self.log_dir)
-
- self._ignore_exit_code = self._options.ignore_exit_code
- if self._options.gtest_filter != "":
- self._args.append("--gtest_filter=%s" % self._options.gtest_filter)
- if self._options.gtest_repeat:
- self._args.append("--gtest_repeat=%s" % self._options.gtest_repeat)
- if self._options.gtest_print_time:
- self._args.append("--gtest_print_time")
-
- return True
-
- def Setup(self, args):
- return self.ParseArgv(args)
-
- def ToolCommand(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def Cleanup(self):
- # You may override it in the tool-specific subclass
- pass
-
- def Execute(self):
- """ Execute the app to be tested after successful instrumentation.
- Full execution command-line provided by subclassers via proc."""
- logging.info("starting execution...")
- proc = self.ToolCommand()
- for var in self._env:
- common.PutEnvAndLog(var, self._env[var])
- return common.RunSubprocess(proc, self._timeout)
-
- def RunTestsAndAnalyze(self, check_sanity):
- exec_retcode = self.Execute()
- analyze_retcode = self.Analyze(check_sanity)
-
- if analyze_retcode:
- logging.error("Analyze failed.")
- logging.info("Search the log for '[ERROR]' to see the error reports.")
- return analyze_retcode
-
- if exec_retcode:
- if self._ignore_exit_code:
- logging.info("Test execution failed, but the exit code is ignored.")
- else:
- logging.error("Test execution failed.")
- return exec_retcode
- else:
- logging.info("Test execution completed successfully.")
-
- if not analyze_retcode:
- logging.info("Analysis completed successfully.")
-
- return 0
-
- def Main(self, args, check_sanity, min_runtime_in_seconds):
- """Call this to run through the whole process: Setup, Execute, Analyze"""
- start_time = datetime.datetime.now()
- retcode = -1
- if self.Setup(args):
- retcode = self.RunTestsAndAnalyze(check_sanity)
- shutil.rmtree(self.temp_dir, ignore_errors=True)
- self.Cleanup()
- else:
- logging.error("Setup failed")
- end_time = datetime.datetime.now()
- runtime_in_seconds = (end_time - start_time).seconds
- hours = runtime_in_seconds / 3600
- seconds = runtime_in_seconds % 3600
- minutes = seconds / 60
- seconds = seconds % 60
- logging.info("elapsed time: %02d:%02d:%02d" % (hours, minutes, seconds))
- if (min_runtime_in_seconds > 0 and
- runtime_in_seconds < min_runtime_in_seconds):
- logging.error("Layout tests finished too quickly. "
- "It should have taken at least %d seconds. "
- "Something went wrong?" % min_runtime_in_seconds)
- retcode = -1
- return retcode
-
- def Run(self, args, module, min_runtime_in_seconds=0):
- MODULES_TO_SANITY_CHECK = ["base"]
-
- check_sanity = module in MODULES_TO_SANITY_CHECK
- return self.Main(args, check_sanity, min_runtime_in_seconds)
-
-
-class ValgrindTool(BaseTool):
- """Abstract class for running Valgrind tools.
-
- Always subclass this and implement ToolSpecificFlags() and
- ExtendOptionParser() for tool-specific stuff.
- """
- def __init__(self):
- super(ValgrindTool, self).__init__()
- self.RegisterOptionParserHook(ValgrindTool.ExtendOptionParser)
-
- def UseXML(self):
- # Override if the tool prefers non-XML output
- return True
-
- def ExtendOptionParser(self, parser):
- parser.add_option("", "--suppressions", default=[],
- action="append",
- help="path to a valgrind suppression file")
- parser.add_option("", "--indirect", action="store_true",
- default=False,
- help="set BROWSER_WRAPPER rather than "
- "running valgrind directly")
- parser.add_option("", "--indirect_webkit_layout", action="store_true",
- default=False,
- help="set --wrapper rather than running Dr. Memory "
- "directly.")
- parser.add_option("", "--trace_children", action="store_true",
- default=False,
- help="also trace child processes")
- parser.add_option("", "--num-callers",
- dest="num_callers", default=30,
- help="number of callers to show in stack traces")
- parser.add_option("", "--generate_dsym", action="store_true",
- default=False,
- help="Generate .dSYM file on Mac if needed. Slow!")
-
- def Setup(self, args):
- if not BaseTool.Setup(self, args):
- return False
- return True
-
- def ToolCommand(self):
- """Get the valgrind command to run."""
- # Note that self._args begins with the exe to be run.
- tool_name = self.ToolName()
-
- # Construct the valgrind command.
- if 'CHROME_VALGRIND' in os.environ:
- path = os.path.join(os.environ['CHROME_VALGRIND'], "bin", "valgrind")
- else:
- path = "valgrind"
- proc = [path, "--tool=%s" % tool_name]
-
- proc += ["--num-callers=%i" % int(self._options.num_callers)]
-
- if self._options.trace_children:
- proc += ["--trace-children=yes"]
- proc += ["--trace-children-skip='*dbus-daemon*'"]
- proc += ["--trace-children-skip='*dbus-launch*'"]
- proc += ["--trace-children-skip='*perl*'"]
- proc += ["--trace-children-skip='*python*'"]
- # This is really Python, but for some reason Valgrind follows it.
- proc += ["--trace-children-skip='*lsb_release*'"]
-
- proc += self.ToolSpecificFlags()
- proc += self._tool_flags
-
- suppression_count = 0
- for suppression_file in self._options.suppressions:
- if os.path.exists(suppression_file):
- suppression_count += 1
- proc += ["--suppressions=%s" % suppression_file]
-
- if not suppression_count:
- logging.warning("WARNING: NOT USING SUPPRESSIONS!")
-
- logfilename = self.log_dir + ("/%s." % tool_name) + "%p"
- if self.UseXML():
- proc += ["--xml=yes", "--xml-file=" + logfilename]
- else:
- proc += ["--log-file=" + logfilename]
-
- # The Valgrind command is constructed.
-
- # Handle --indirect_webkit_layout separately.
- if self._options.indirect_webkit_layout:
- # Need to create the wrapper before modifying |proc|.
- wrapper = self.CreateBrowserWrapper(proc, webkit=True)
- proc = self._args
- proc.append("--wrapper")
- proc.append(wrapper)
- return proc
-
- if self._options.indirect:
- wrapper = self.CreateBrowserWrapper(proc)
- os.environ["BROWSER_WRAPPER"] = wrapper
- logging.info('export BROWSER_WRAPPER=' + wrapper)
- proc = []
- proc += self._args
- return proc
-
- def ToolSpecificFlags(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def CreateBrowserWrapper(self, proc, webkit=False):
- """The program being run invokes Python or something else that can't stand
- to be valgrinded, and also invokes the Chrome browser. In this case, use a
- magic wrapper to only valgrind the Chrome browser. Build the wrapper here.
- Returns the path to the wrapper. It's up to the caller to use the wrapper
- appropriately.
- """
- command = " ".join(proc)
- # Add the PID of the browser wrapper to the logfile names so we can
- # separate log files for different UI tests at the analyze stage.
- command = command.replace("%p", "$$.%p")
-
- (fd, indirect_fname) = tempfile.mkstemp(dir=self.log_dir,
- prefix="browser_wrapper.",
- text=True)
- f = os.fdopen(fd, "w")
- f.write('#!/bin/bash\n'
- 'echo "Started Valgrind wrapper for this test, PID=$$" >&2\n')
-
- f.write('DIR=`dirname $0`\n'
- 'TESTNAME_FILE=$DIR/testcase.$$.name\n\n')
-
- if webkit:
- # Webkit layout_tests pass the URL as the first line of stdin.
- f.write('tee $TESTNAME_FILE | %s "$@"\n' % command)
- else:
- # Try to get the test case name by looking at the program arguments.
- # i.e. Chromium ui_tests used --test-name arg.
- # TODO(timurrrr): This doesn't handle "--test-name Test.Name"
- # TODO(timurrrr): ui_tests are dead. Where do we use the non-webkit
- # wrapper now? browser_tests? What do they do?
- f.write('for arg in $@\ndo\n'
- ' if [[ "$arg" =~ --test-name=(.*) ]]\n then\n'
- ' echo ${BASH_REMATCH[1]} >$TESTNAME_FILE\n'
- ' fi\n'
- 'done\n\n'
- '%s "$@"\n' % command)
-
- f.close()
- os.chmod(indirect_fname, stat.S_IRUSR|stat.S_IXUSR)
- return indirect_fname
-
- def CreateAnalyzer(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def GetAnalyzeResults(self, check_sanity=False):
- # Glob all the files in the log directory
- filenames = glob.glob(self.log_dir + "/" + self.ToolName() + ".*")
-
- # If we have browser wrapper, the logfiles are named as
- # "toolname.wrapper_PID.valgrind_PID".
- # Let's extract the list of wrapper_PIDs and name it ppids
- ppids = set([int(f.split(".")[-2]) \
- for f in filenames if re.search("\.[0-9]+\.[0-9]+$", f)])
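-
- # For example (hypothetical file names): "memcheck.1234.5678" was written
- # by valgrind PID 5678 under wrapper PID 1234, so 1234 lands in ppids;
- # a wrapper-less log named "memcheck.5678" does not match the pattern.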
-
- analyzer = self.CreateAnalyzer()
- if len(ppids) == 0:
- # Fast path - no browser wrapper was set.
- return analyzer.Report(filenames, None, check_sanity)
-
- ret = 0
- for ppid in ppids:
- testcase_name = None
- try:
- f = open(self.log_dir + ("/testcase.%d.name" % ppid))
- testcase_name = f.read().strip()
- f.close()
- wk_layout_prefix="third_party/WebKit/LayoutTests/"
- wk_prefix_at = testcase_name.rfind(wk_layout_prefix)
- if wk_prefix_at != -1:
- testcase_name = testcase_name[wk_prefix_at + len(wk_layout_prefix):]
- except IOError:
- pass
- print "====================================================="
- print " Below is the report for valgrind wrapper PID=%d." % ppid
- if testcase_name:
- print " It was used while running the `%s` test." % testcase_name
- else:
- print " You can find the corresponding test"
- print " by searching the above log for 'PID=%d'" % ppid
- sys.stdout.flush()
-
- ppid_filenames = [f for f in filenames \
- if re.search("\.%d\.[0-9]+$" % ppid, f)]
- # check_sanity won't work with browser wrappers
- assert not check_sanity
- ret |= analyzer.Report(ppid_filenames, testcase_name)
- print "====================================================="
- sys.stdout.flush()
-
- if ret != 0:
- print ""
- print "The Valgrind reports are grouped by test names."
- print "Each test has its PID printed in the log when the test was run"
- print "and at the beginning of its Valgrind report."
- print "Hint: you can search for the reports by Ctrl+F -> `=#`"
- sys.stdout.flush()
-
- return ret
-
-
-# TODO(timurrrr): Split into a separate file.
-class Memcheck(ValgrindTool):
- """Memcheck
- Dynamic memory error detector for Linux & Mac
-
- http://valgrind.org/info/tools.html#memcheck
- """
-
- def __init__(self):
- super(Memcheck, self).__init__()
- self.RegisterOptionParserHook(Memcheck.ExtendOptionParser)
-
- def ToolName(self):
- return "memcheck"
-
- def ExtendOptionParser(self, parser):
- parser.add_option("--leak-check", "--leak_check", type="string",
- default="yes", # --leak-check=yes is equivalent of =full
- help="perform leak checking at the end of the run")
- parser.add_option("", "--show_all_leaks", action="store_true",
- default=False,
- help="also show less blatant leaks")
- parser.add_option("", "--track_origins", action="store_true",
- default=False,
- help="Show whence uninitialized bytes came. 30% slower.")
-
- def ToolSpecificFlags(self):
- ret = ["--gen-suppressions=all", "--demangle=no"]
- ret += ["--leak-check=%s" % self._options.leak_check]
-
- if self._options.show_all_leaks:
- ret += ["--show-reachable=yes"]
- else:
- ret += ["--show-possibly-lost=no"]
-
- if self._options.track_origins:
- ret += ["--track-origins=yes"]
-
- # TODO(glider): this is a temporary workaround for http://crbug.com/51716
- # Let's see whether it helps.
- if common.IsMac():
- ret += ["--smc-check=all"]
-
- return ret
-
- def CreateAnalyzer(self):
- use_gdb = common.IsMac()
- return memcheck_analyze.MemcheckAnalyzer(self._source_dir,
- self._options.show_all_leaks,
- use_gdb=use_gdb)
-
- def Analyze(self, check_sanity=False):
- ret = self.GetAnalyzeResults(check_sanity)
-
- if ret != 0:
- logging.info("Please see http://dev.chromium.org/developers/how-tos/"
- "using-valgrind for the info on Memcheck/Valgrind")
- return ret
-
-
-class ToolFactory:
- def Create(self, tool_name):
- if tool_name == "memcheck":
- return Memcheck()
- try:
- platform_name = common.PlatformNames()[0]
- except common.NotImplementedError:
- platform_name = sys.platform + "(Unknown)"
- raise RuntimeError, "Unknown tool (tool=%s, platform=%s)" % (tool_name,
- platform_name)
-
-def CreateTool(tool):
- return ToolFactory().Create(tool)
diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc
deleted file mode 100644
index 32a4cd1c..00000000
--- a/files/unit_test/convert_test.cc
+++ /dev/null
@@ -1,3223 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-
-#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "../unit_test/unit_test.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/video_common.h"
-
-#if defined(__arm__) || defined(__aarch64__)
-// The ARM version subsamples by summing 4 pixels, then multiplying by a
-// matrix with 4x smaller coefficients rounded to the nearest integer.
-#define ARM_YUV_ERROR 4
-#else
-#define ARM_YUV_ERROR 0
-#endif
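-// Rough intuition for the bound (assumed, not derived here): rounding the
-// 4x-smaller coefficients to integers perturbs each product slightly, and
-// across the YUV matrix this can move an 8-bit result by a few counts,
-// hence the tolerance of 4 on ARM.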
-
-namespace libyuv {
-
-// Aliases to copy pixels as-is
-#define AR30ToAR30 ARGBCopy
-#define ABGRToABGR ARGBCopy
-
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
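-// SUBSAMPLE is a ceiling divide, e.g. SUBSAMPLE(5, 2) == 3: a 5-pixel-wide
-// plane subsampled by 2 still needs 3 chroma samples.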
-
-// Planar test
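-// Each test below follows the same pattern: run the conversion once with
-// SIMD masked off (MaskCpuFlags(disable_cpu_flags_)) to get the C reference,
-// run it benchmark_iterations_ times with SIMD enabled, then compare the two
-// outputs byte for byte (or within a small tolerance for lossy paths).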
-
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
- static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
- static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
- "DST SRC_SUBSAMP_X unsupported"); \
- static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
- "DST SRC_SUBSAMP_Y unsupported"); \
- static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
- "DST DST_SUBSAMP_X unsupported"); \
- static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
- "DST DST_SUBSAMP_Y unsupported"); \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
- const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
- const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
- const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
- align_buffer_page_end(src_u, \
- kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
- align_buffer_page_end(src_v, \
- kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
- align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
- align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
- MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
- MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
- memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
- memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
- memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
- reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
- reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
- reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
- reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
- reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
- reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
- NEG kHeight); \
- } \
- for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
- EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
- } \
- for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
- EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
- EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- }
-
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
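-
-// The four instantiations cover: _Any (width benchmark_width_ - 4, which is
-// typically not a multiple of the SIMD step, exercising the any-width
-// fallback), _Unaligned (buffers offset by 1 byte), _Invert (negative height,
-// i.e. bottom-up images), and _Opt (aligned buffers at full width).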
-
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
-
-// Test Android 420 to I420
-#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, \
- kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- uint8_t* src_u = src_uv + OFF_U; \
- uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
- int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
- kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
- dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_EQ(0, max_diff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
- SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
- SUBSAMP_Y) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \
- _Any, +, 0, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
- _Unaligned, +, 1, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
- -, 0, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
- 0, PN, OFF_U, OFF_V)
-
-TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
-TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
-TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
-
-// Wrapper to keep the API the same as the other planar conversions; I400 has
-// no U/V planes, so the U/V arguments are accepted and ignored.
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* /* src_u */,
- int /* src_stride_u */,
- const uint8_t* /* src_v */,
- int /* src_stride_v */,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
- dst_stride_vu, width, height);
-}
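-
-// This lets the TESTPLANARTOBP(I400, ...) instantiation below call
-// I400ToNV21 through the same three-plane entry point as I420ToNV21 etc.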
-
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- }
-
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
-TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
-TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
-TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
-TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
-
-#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
- OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 0 + OFF] = \
- (fastrand() & 0xff); \
- src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 1 + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
-
-// TODO(fbarchard): Fix msan on this unittest
-// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
-
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
- DOY) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
- kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- if (DOY) { \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
- 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
-
-TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
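-// ALIGNINT rounds V up to the nearest multiple of ALIGN, e.g.
-// ALIGNINT(10, 4) == 12.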
-
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- double time0 = get_time(); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- double time1 = get_time(); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
- kStrideB, kWidth, NEG kHeight); \
- } \
- double time2 = get_time(); \
- printf(" %8d us C - %8d us OPT\n", \
- static_cast<int>((time1 - time0) * 1e6), \
- static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
-TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
-TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
-TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
-TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
-TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
-
-#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
- ATTEN); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
- dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
- ATTEN); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
- static_cast<int>(dst_argb_opt[i + OFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
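-
-// The trailing ATTEN argument is the converter's attenuate flag; the
-// _Premult variant passes 1 so RGB is premultiplied by alpha during
-// conversion.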
-
-TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
-TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
- BPP_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
- } \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeight); \
- memset(dst_argb_opt, 101, kStrideB* kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
- dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
- dst_argb_opt, kWidth * BPP_B, kWidth, \
- NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
- kHeight); \
- FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
- kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
- }
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
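-
-// Hedged sketch of what one instantiation above generates: the _Opt variant
-// of TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) times calls shaped like
-// the hand-written one below (a 2x2 NV12 image with one interleaved UV pair).
-TEST_F(LibYUVConvertTest, NV12ToARGBTinySketch) {
-  SIMD_ALIGNED(uint8_t src_y[4]) = {16, 16, 16, 16};  // 2x2 luma plane.
-  SIMD_ALIGNED(uint8_t src_uv[2]) = {128, 128};       // Interleaved U, V.
-  SIMD_ALIGNED(uint8_t dst_argb[16]);
-  NV12ToARGB(src_y, 2, src_uv, 2, dst_argb, 8, 2, 2);
-  // Y=16, U=V=128 is black in BT.601; all color channels should be near 0.
-  EXPECT_NEAR(0, dst_argb[0], 2);  // B.
-  EXPECT_NEAR(0, dst_argb[1], 2);  // G.
-  EXPECT_NEAR(0, dst_argb[2], 2);  // R.
-}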
-
-#ifdef DO_THREE_PLANES
-// Do 3 allocations for YUV; conventional but slower.
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
- kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
- kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
- static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
- static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#else
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
- kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
- kStrideUV * 2, kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \
- static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#endif
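-
-// Hedged note on the default (two-allocation) path above: U and V share one
-// buffer by doubling the chroma stride and offsetting the V pointer by one
-// chroma row, so U and V rows alternate within a single allocation while the
-// conventional three-plane API is still exercised. A minimal 2x2 sketch:
-TEST_F(LibYUVConvertTest, SharedUVBufferSketch) {
-  SIMD_ALIGNED(uint8_t src_argb[16]);
-  SIMD_ALIGNED(uint8_t dst_y[4]);
-  SIMD_ALIGNED(uint8_t dst_uv[2]);  // kStrideUV == 1; U at [0], V at [1].
-  memset(src_argb, 0xff, sizeof(src_argb));  // 2x2 white.
-  ARGBToI420(src_argb, 8, dst_y, 2, dst_uv, 2, dst_uv + 1, 2, 2, 2);
-  EXPECT_NEAR(128, dst_uv[0], 2);  // U row landed at offset 0.
-  EXPECT_NEAR(128, dst_uv[1], 2);  // V row landed at offset kStrideUV.
-}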
-
-#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
-TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
-TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
-TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
-// TODO(fbarchard): Investigate J420 error of 11 on Windows.
-TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 11)
-TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
-TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
-
-#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
- SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
- kStrideUV * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-
-#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
-TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
-TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
-TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
-TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
-TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
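-
-// Hedged sketch of a single biplanar conversion outside the macros: a 2x2
-// solid-white ARGB image should produce limited-range white luma (Y=235 in
-// BT.601) and a neutral interleaved UV pair.
-TEST_F(LibYUVConvertTest, ARGBToNV12TinySketch) {
-  SIMD_ALIGNED(uint8_t src_argb[16]);
-  SIMD_ALIGNED(uint8_t dst_y[4]);
-  SIMD_ALIGNED(uint8_t dst_uv[2]);
-  memset(src_argb, 0xff, sizeof(src_argb));  // 2x2 white, alpha 255.
-  ARGBToNV12(src_argb, 8, dst_y, 2, dst_uv, 2, 2, 2);
-  EXPECT_NEAR(235, dst_y[0], 2);   // Luma of white in limited range.
-  EXPECT_NEAR(128, dst_uv[0], 2);  // U is neutral for white.
-  EXPECT_NEAR(128, dst_uv[1], 2);  // V is neutral for white.
-}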
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeightB); \
- memset(dst_argb_opt, 101, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 123, kStrideB* kHeightB); \
- memset(dst_argb_opt, 123, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \
- kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
- kHeight); \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
- } \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- } \
- }
-
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-// TODO(fbarchard): make ARM version of C code that matches NEON.
-TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
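-
-// Hedged worked example of the stride rounding used by TESTATOBI above:
-// (kWidth * BPP + STRIDE - 1) / STRIDE * STRIDE rounds the row byte count up
-// to the next multiple of STRIDE. A 17-pixel, 2-byte-per-pixel row aligned
-// to 4 bytes holds 34 data bytes padded out to 36:
-TEST_F(LibYUVConvertTest, StrideRoundingSketch) {
-  const int kWidth = 17;
-  const int kBpp = 2;
-  const int kAlign = 4;
-  const int kStride = (kWidth * kBpp + kAlign - 1) / kAlign * kAlign;
-  EXPECT_EQ(36, kStride);          // 34 rounded up to a multiple of 4.
-  EXPECT_EQ(0, kStride % kAlign);  // Always a whole number of units.
-}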
-
-#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeightB); \
- memset(dst_argb_opt, 101, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
- kStrideB, NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 123, kStrideB* kHeightB); \
- memset(dst_argb_opt, 123, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
- kWidth, kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
- NULL, kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- } \
- }
-
-#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
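-
-// Hedged usage sketch: the dither macros above pass NULL, which disables
-// dithering. A caller can instead supply a 16-byte 4x4 ordered-dither table
-// (the Bayer matrix below is illustrative, not a libyuv constant):
-TEST_F(LibYUVConvertTest, ARGBToRGB565DitherMatrixSketch) {
-  static const uint8_t kBayer4x4[16] = {0, 8,  2, 10, 12, 4, 14, 6,
-                                        3, 11, 1, 9,  15, 7, 13, 5};
-  SIMD_ALIGNED(uint8_t src_argb[4 * 4 * 4]);    // 4x4 ARGB.
-  SIMD_ALIGNED(uint8_t dst_rgb565[4 * 4 * 2]);  // 4x4 RGB565.
-  memset(src_argb, 0x80, sizeof(src_argb));     // Mid-gray everywhere.
-  EXPECT_EQ(0, ARGBToRGB565Dither(src_argb, 4 * 4, dst_rgb565, 4 * 2,
-                                  kBayer4x4, 4, 4));
-}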
-
-#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
-  TEST_F(LibYUVConvertTest, FMT_ATOB##_Symmetric##N) {                        \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideA* kHeightA); \
- memset(dst_argb_opt, 101, kStrideA* kHeightA); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \
- NEG kHeight); \
- } \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \
- NEG kHeight); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
- EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
- } \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \
- 0) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \
- +, 1) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0)
-
-TESTSYM(ARGBToARGB, 4, 4, 1)
-TESTSYM(ARGBToBGRA, 4, 4, 1)
-TESTSYM(ARGBToABGR, 4, 4, 1)
-TESTSYM(BGRAToARGB, 4, 4, 1)
-TESTSYM(ABGRToARGB, 4, 4, 1)
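-
-// Hedged note on TESTSYM: each conversion above is a pure byte shuffle that
-// is its own inverse, so applying it twice must reproduce the input exactly;
-// that is what the in-place second pass in TESTSYMI verifies. One pixel:
-TEST_F(LibYUVConvertTest, ARGBToABGRInvolutionSketch) {
-  SIMD_ALIGNED(uint8_t pixel[4]) = {1, 2, 3, 4};  // B, G, R, A.
-  SIMD_ALIGNED(uint8_t once[4]);
-  SIMD_ALIGNED(uint8_t twice[4]);
-  ARGBToABGR(pixel, 4, once, 4, 1, 1);  // Swaps the R and B channels.
-  ARGBToABGR(once, 4, twice, 4, 1, 1);  // Swapping again restores them.
-  for (int i = 0; i < 4; ++i) {
-    EXPECT_EQ(pixel[i], twice[i]);
-  }
-}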
-
-TEST_F(LibYUVConvertTest, Test565) {
- SIMD_ALIGNED(uint8_t orig_pixels[256][4]);
- SIMD_ALIGNED(uint8_t pixels565[256][2]);
-
- for (int i = 0; i < 256; ++i) {
- for (int j = 0; j < 4; ++j) {
- orig_pixels[i][j] = i;
- }
- }
- ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
- EXPECT_EQ(610919429u, checksum);
-}
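-
-// Hedged companion to Test565: for a gray value g the expected 565 packing
-// is ((g >> 3) << 11) | ((g >> 2) << 5) | (g >> 3), i.e. the top 5, 6 and 5
-// bits of R, G and B. Spot-checking one value (assuming a little-endian
-// target, as the fixed checksum above already does) makes the hash less
-// opaque:
-TEST_F(LibYUVConvertTest, Test565PackingSketch) {
-  SIMD_ALIGNED(uint8_t argb[4]) = {200, 200, 200, 255};  // B, G, R, A.
-  SIMD_ALIGNED(uint8_t rgb565[2]);
-  ARGBToRGB565(argb, 4, rgb565, 2, 1, 1);
-  const int packed = rgb565[0] | (rgb565[1] << 8);
-  EXPECT_EQ(((200 >> 3) << 11) | ((200 >> 2) << 5) | (200 >> 3), packed);
-}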
-
-#ifdef HAVE_JPEG
-TEST_F(LibYUVConvertTest, ValidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // Test special value that matches marker start.
- memset(orig_pixels, 0xff, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
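-
-// Hedged note: ValidateJpeg only needs the two-byte big-endian framing
-// markers, SOI (0xffd8) at the start and EOI (0xffd9) near the end, plus the
-// 64-byte minimum that kMinJpeg above suggests. Under those assumptions, the
-// smallest hand-built buffer that should pass is:
-TEST_F(LibYUVConvertTest, ValidateJpegMarkerSketch) {
-  uint8_t buf[64];
-  memset(buf, 0, sizeof(buf));
-  buf[0] = 0xff;
-  buf[1] = 0xd8;  // SOI.
-  buf[2] = 0xff;  // Next marker prefix, as the tests above also set.
-  buf[62] = 0xff;
-  buf[63] = 0xd9;  // EOI, found by the backwards tail scan.
-  EXPECT_TRUE(ValidateJpeg(buf, sizeof(buf)));
-}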
-
-TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- const int kMultiple = 10;
- const int kBufSize = kImageSize * kMultiple + kOff;
- align_buffer_page_end(orig_pixels, kBufSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kBufSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(LibYUVConvertTest, InvalidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
-
- // NULL pointer. Expect fail.
- EXPECT_FALSE(ValidateJpeg(NULL, kSize));
-
- // Negative size. Expect fail.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
-
- // Too large size. Expect fail.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // SOI but no EOI. Expect fail.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
- }
-
- // EOI but no SOI. Expect fail.
- orig_pixels[0] = 0;
- orig_pixels[1] = 0;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(LibYUVConvertTest, FuzzJpeg) {
- // SOI but no EOI. Expect fail.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- const int kSize = fastrand() % 5000 + 3;
- align_buffer_page_end(orig_pixels, kSize);
- MemRandomize(orig_pixels, kSize);
-
- // Add SOI so frame will be scanned.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - 1] = 0xff;
- ValidateJpeg(orig_pixels,
- kSize); // Failure normally expected.
- free_aligned_buffer_page_end(orig_pixels);
- }
-}
-
-// Test data created in GIMP. When exporting a JPEG, disable
-// thumbnails etc., choose a subsampling mode, and use low quality
-// (50) to keep the size small. Generated with xxd -i test.jpg.
-// test 0 is J400
-static const uint8_t kTest0Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
- 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
- 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
- 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
- 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
- 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
- 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
- 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
- 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
- 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
- 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
- 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
- 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
- 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
- 0xd9};
-static const size_t kTest0JpgLen = 421;
-
-// test 1 is J444
-static const uint8_t kTest1Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
- 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
- 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
- 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
- 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
- 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
- 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
- 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
- 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
- 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
- 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
- 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
- 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
- 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
- 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
- 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
- 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
- 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
- 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
- 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
- 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
- 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
- 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
- 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
- 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
- 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
- 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
- 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
- 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
- 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
- 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
- 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
- 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
- 0xd4, 0xff, 0xd9};
-static const size_t kTest1JpgLen = 735;
-
-// test 2 is J420
-static const uint8_t kTest2Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
- 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
- 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
- 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
- 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
- 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
- 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
- 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
- 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
- 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
- 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
- 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
- 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
- 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
- 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
- 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
- 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
- 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
- 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
- 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
- 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
- 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
- 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
- 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
- 0xd9};
-static const size_t kTest2JpgLen = 685;
-
-// test 3 is J422
-static const uint8_t kTest3Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
- 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
- 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
- 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
- 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
- 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
- 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
- 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
- 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
- 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
- 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
- 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
- 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
- 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
- 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
- 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
- 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
- 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
- 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
- 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
- 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
- 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
- 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
- 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
-static const size_t kTest3JpgLen = 704;
-
-// test 4 is J422 vertical (1x2 sampling, i.e. 4:4:0) - not supported
-static const uint8_t kTest4Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
- 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
- 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
- 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
- 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
- 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
- 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
- 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
- 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
- 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
- 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
- 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
- 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
- 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
- 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
- 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
- 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
- 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
- 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
- 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
- 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
-static const size_t kTest4JpgLen = 701;
-
-TEST_F(LibYUVConvertTest, TestMJPGSize) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- printf("test jpeg size %d x %d\n", width, height);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToI420) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_u, half_width * half_height);
- align_buffer_page_end(dst_v, half_width * half_height);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
- dst_v, half_width, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
- uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_u_hash, 2501859930u);
- EXPECT_EQ(dst_v_hash, 2126459123u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
-}
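-
-// Hedged arithmetic note on the iteration scaling above: the 32x16 test JPEG
-// is decoded enough times to cover roughly benchmark_width_ x
-// benchmark_height_ pixels per configured iteration, keeping timings
-// comparable across image sizes. With an illustrative 1280x720 benchmark
-// size (not necessarily the fixture default), one iteration becomes:
-TEST_F(LibYUVConvertTest, MJPGIterationScalingSketch) {
-  const int kJpegWidth = 32;  // kTest2Jpg dimensions per MJPGSize above.
-  const int kJpegHeight = 16;
-  const int kBenchWidth = 1280;
-  const int kBenchHeight = 720;
-  EXPECT_EQ(1800, kBenchWidth * kBenchHeight / (kJpegWidth * kJpegHeight));
-}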
-
-TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- // Convert to NV21
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_vu, half_width * half_height * 2);
-
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Convert to I420
- align_buffer_page_end(dst2_y, width * height);
- align_buffer_page_end(dst2_u, half_width * half_height);
- align_buffer_page_end(dst2_v, half_width * half_height);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
- dst2_v, half_width, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Convert I420 to NV21
- align_buffer_page_end(dst3_y, width * height);
- align_buffer_page_end(dst3_vu, half_width * half_height * 2);
-
- I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
- width, dst3_vu, half_width * 2, width, height);
-
- for (int i = 0; i < width * height; ++i) {
- EXPECT_EQ(dst_y[i], dst3_y[i]);
- }
- for (int i = 0; i < half_width * half_height * 2; ++i) {
-    EXPECT_EQ(dst_vu[i], dst3_vu[i]);
- }
-
- free_aligned_buffer_page_end(dst3_y);
- free_aligned_buffer_page_end(dst3_vu);
-
- free_aligned_buffer_page_end(dst2_y);
- free_aligned_buffer_page_end(dst2_u);
- free_aligned_buffer_page_end(dst2_v);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_vu);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 1069662856u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 3543430771u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 330644005u);
- EXPECT_EQ(dst_uv_hash, 135214341u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 506143297u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_argb, width * height * 4);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
- height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
- EXPECT_EQ(dst_argb_hash, 2355976473u);
-
- free_aligned_buffer_page_end(dst_argb);
-}
-
-static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
- MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-
- int width = mjpeg_decoder.GetWidth();
- int height = mjpeg_decoder.GetHeight();
-
- // YUV420
- if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J420, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J422, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J444, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- printf("JPeg is J400, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- } else {
-    // Unknown colorspace.
-    printf("JPeg has an unknown colorspace.\n");
- }
- mjpeg_decoder.UnloadFrame();
- return ret;
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGInfo) {
- EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
- kTest4JpgLen)); // Valid but unsupported.
-}
-#endif // HAVE_JPEG
-
-TEST_F(LibYUVConvertTest, NV12Crop) {
- const int SUBSAMP_X = 2;
- const int SUBSAMP_Y = 2;
- const int kWidth = benchmark_width_;
- const int kHeight = benchmark_height_;
- const int crop_y =
- ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
- const int kDestWidth = benchmark_width_;
- const int kDestHeight = benchmark_height_ - crop_y * 2;
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int sample_size =
- kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
- align_buffer_page_end(src_y, sample_size);
- uint8_t* src_uv = src_y + kWidth * kHeight;
-
- align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- for (int i = 0; i < kHeight * kWidth; ++i) {
- src_y[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
- src_uv[i] = (fastrand() & 0xff);
- }
- memset(dst_y, 1, kDestWidth * kDestHeight);
- memset(dst_u, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_y_2, 1, kDestWidth * kDestHeight);
- memset(dst_u_2, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v_2, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
- kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
-
- NV12ToI420(src_y + crop_y * kWidth, kWidth,
- src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
- kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
-
- for (int i = 0; i < kDestHeight; ++i) {
- for (int j = 0; j < kDestWidth; ++j) {
-      EXPECT_EQ(dst_y[i * kDestWidth + j], dst_y_2[i * kDestWidth + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
- dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
- dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
- free_aligned_buffer_page_end(dst_y_2);
- free_aligned_buffer_page_end(dst_u_2);
- free_aligned_buffer_page_end(dst_v_2);
- free_aligned_buffer_page_end(src_y);
-}
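-
-// The reference crop above relies on NV12's layout: a full-resolution Y plane
-// followed by a single interleaved UV plane at half resolution in both axes.
-// A minimal sketch of the cropped-pointer arithmetic used with NV12ToI420
-// (names here are illustrative only):
-struct NV12CropViewSketch {
-  const uint8_t* y;
-  const uint8_t* uv;
-};
-static inline NV12CropViewSketch CropNV12Sketch(const uint8_t* y,
-                                                const uint8_t* uv,
-                                                int width,
-                                                int crop_x,
-                                                int crop_y) {
-  // One U byte and one V byte per 2x2 block of Y, interleaved.
-  const int uv_stride = ((width + 1) / 2) * 2;
-  NV12CropViewSketch view;
-  view.y = y + crop_y * width + crop_x;
-  view.uv = uv + (crop_y / 2) * uv_stride + (crop_x / 2) * 2;
-  return view;
-}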
-
-TEST_F(LibYUVConvertTest, I420CropOddY) {
- const int SUBSAMP_X = 2;
- const int SUBSAMP_Y = 2;
- const int kWidth = benchmark_width_;
- const int kHeight = benchmark_height_;
- const int crop_y = 1;
- const int kDestWidth = benchmark_width_;
- const int kDestHeight = benchmark_height_ - crop_y * 2;
- const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int sample_size = kWidth * kHeight +
- kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
- kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
- align_buffer_page_end(src_y, sample_size);
- uint8_t* src_u = src_y + kWidth * kHeight;
- uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
-
- align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- for (int i = 0; i < kHeight * kWidth; ++i) {
- src_y[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
- src_u[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
- src_v[i] = (fastrand() & 0xff);
- }
- memset(dst_y, 1, kDestWidth * kDestHeight);
- memset(dst_u, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- MaskCpuFlags(benchmark_cpu_info_);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
- kDestWidth, kDestHeight, libyuv::kRotate0,
- libyuv::FOURCC_I420);
- }
-
- for (int i = 0; i < kDestHeight; ++i) {
- for (int j = 0; j < kDestWidth; ++j) {
- EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
- dst_y[i * kDestWidth + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
- dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
- dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
- free_aligned_buffer_page_end(src_y);
-}
-
-TEST_F(LibYUVConvertTest, TestYToARGB) {
- uint8_t y[32];
- uint8_t expectedg[32];
- for (int i = 0; i < 32; ++i) {
- y[i] = i * 5 + 17;
- expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
- }
- uint8_t argb[32 * 4];
- YToARGB(y, 0, argb, 0, 32, 1);
-
- for (int i = 0; i < 32; ++i) {
- printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
- argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
- }
- for (int i = 0; i < 32; ++i) {
- EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
- }
-}
-
-static const uint8_t kNoDither4x4[16] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-TEST_F(LibYUVConvertTest, TestNoDither) {
- align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_rgb565dither,
- benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
- ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
- benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
- benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
- benchmark_height_);
- for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
- EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
- }
-
- free_aligned_buffer_page_end(src_argb);
- free_aligned_buffer_page_end(dst_rgb565);
- free_aligned_buffer_page_end(dst_rgb565dither);
-}
-
-// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
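-
-// A minimal sketch of how a single ordered dither value is applied per pixel
-// before 888 -> 565 truncation, assuming libyuv's RGB565 layout with blue in
-// the low 5 bits (illustrative, not the library's optimized row code). The
-// 4x4 table is indexed by (row & 3) * 4 + (col & 3):
-static inline uint16_t DitherTo565Sketch(int b, int g, int r, int dither) {
-  b += dither;  // add 0..7, then clamp before truncating to 5/6/5 bits
-  g += dither;
-  r += dither;
-  if (b > 255) {
-    b = 255;
-  }
-  if (g > 255) {
-    g = 255;
-  }
-  if (r > 255) {
-    r = 255;
-  }
-  return static_cast<uint16_t>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
-}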
-
-TEST_F(LibYUVConvertTest, TestDither) {
- align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_rgb565dither,
- benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_argbdither,
- benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
- ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
- benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
- benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
- benchmark_height_);
- RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
- benchmark_width_ * 4, benchmark_width_, benchmark_height_);
-
- for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
- EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
- }
- free_aligned_buffer_page_end(src_argb);
- free_aligned_buffer_page_end(dst_rgb565);
- free_aligned_buffer_page_end(dst_rgb565dither);
- free_aligned_buffer_page_end(dst_argb);
- free_aligned_buffer_page_end(dst_argbdither);
-}
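-
-// The tolerance of 9 above reflects the worst case: a dither offset of up to
-// 7 can push a 5-bit channel across at most one quantization step, and one
-// step expands to a difference of 8 or 9 in the recovered 8-bit value.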
-
-#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, \
- kStrideB, NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B##Dither( \
- src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
- dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
- FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
- kWidth * BPP_C, kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
- }
-
-#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
-
-#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
- TEST_F(LibYUVConvertTest, NAME) { \
- const int kWidth = benchmark_width_; \
- const int kHeight = benchmark_height_; \
- \
- align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- align_buffer_page_end(orig_y, kWidth* kHeight); \
- align_buffer_page_end(orig_u, \
- SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- align_buffer_page_end(orig_v, \
- SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_orig, \
- 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y, kWidth* kHeight); \
- align_buffer_page_end(dst_uv, \
- 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- \
- /* Convert UYVY to NV12 in 2 steps for reference */ \
- libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
- orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
- SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
- SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
- 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- \
- /* Convert to NV12 */ \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
- dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- } \
- \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(orig_y[i], dst_y[i]); \
- } \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
- } \
- for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
- ++i) { \
- EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
- } \
- \
- free_aligned_buffer_page_end(orig_uyvy); \
- free_aligned_buffer_page_end(orig_y); \
- free_aligned_buffer_page_end(orig_u); \
- free_aligned_buffer_page_end(orig_v); \
- free_aligned_buffer_page_end(dst_y_orig); \
- free_aligned_buffer_page_end(dst_uv_orig); \
- free_aligned_buffer_page_end(dst_y); \
- free_aligned_buffer_page_end(dst_uv); \
- }
-
-TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
-TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
-
-// Transitive tests. Converting A to B and then B to C should equal A to C.
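-// For example, I420ToARGB followed by ARGBToABGR must match I420ToABGR byte
-// for byte.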
-
-#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_b + OFF, \
- kStrideB, kWidth, NEG kHeight); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
-TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
-TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
-TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
-TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
-TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
-TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
-TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
-TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
-
-#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
- dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
-
-TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
-TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
-
-#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
- OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; i += 4) { \
- EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \
- EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \
- EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \
- EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \
- } \
- free_aligned_buffer_page_end(src_argb_a); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Opt, +, 0, FMT_C, BPP_C)
-
-// Caveat: Destination needs to be 4 bytes per pixel.
-TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
-TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
-TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
-TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
-TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
-TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
-TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
-TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
-
-TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
- // 2x2 frames
- uint32_t src[4];
- uint32_t dst[4];
- // some random input
- src[0] = 0x11000000;
- src[1] = 0x00450000;
- src[2] = 0x00009f00;
- src[3] = 0x000000ff;
- // zeros on destination
- dst[0] = 0x00000000;
- dst[1] = 0x00000000;
- dst[2] = 0x00000000;
- dst[3] = 0x00000000;
-
- int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src),
- 16, // input size
- reinterpret_cast<uint8_t*>(dst),
- 8, // destination stride
- 0, // crop_x
- 0, // crop_y
- 2, // width
- 2, // height
- 2, // crop width
- 2, // crop height
- kRotate90, FOURCC_ARGB);
-
- EXPECT_EQ(r, 0);
- // 90 degrees rotation, no conversion
- EXPECT_EQ(dst[0], src[2]);
- EXPECT_EQ(dst[1], src[0]);
- EXPECT_EQ(dst[2], src[3]);
- EXPECT_EQ(dst[3], src[1]);
-}
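-
-// The expectations above follow the clockwise 90-degree mapping
-// dst(x, y) = src(y, height - 1 - x). A minimal sketch for a row-major
-// width x height frame of 32-bit pixels (illustrative only):
-static inline void Rotate90Sketch(const uint32_t* src,
-                                  uint32_t* dst,
-                                  int width,
-                                  int height) {
-  // After rotation the destination is height pixels wide, width pixels tall.
-  for (int y = 0; y < width; ++y) {
-    for (int x = 0; x < height; ++x) {
-      dst[y * height + x] = src[(height - 1 - x) * width + y];
-    }
-  }
-}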
-
-#ifdef HAS_ARGBTOAR30ROW_AVX2
-TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
- // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels.
- const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
- align_buffer_page_end(src, kPixels * 4);
- align_buffer_page_end(dst_opt, kPixels * 4);
- align_buffer_page_end(dst_c, kPixels * 4);
- MemRandomize(src, kPixels * 4);
- memset(dst_opt, 0, kPixels * 4);
- memset(dst_c, 1, kPixels * 4);
-
- ARGBToAR30Row_C(src, dst_c, kPixels);
-
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- if (has_avx2) {
- ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
- } else if (has_ssse3) {
- ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
- } else {
- ARGBToAR30Row_C(src, dst_opt, kPixels);
- }
- }
- for (int i = 0; i < kPixels * 4; ++i) {
- EXPECT_EQ(dst_opt[i], dst_c[i]);
- }
-
- free_aligned_buffer_page_end(src);
- free_aligned_buffer_page_end(dst_opt);
- free_aligned_buffer_page_end(dst_c);
-}
-#endif // HAS_ARGBTOAR30ROW_AVX2
-
-#ifdef HAS_ABGRTOAR30ROW_AVX2
-TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
- // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
- const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
- align_buffer_page_end(src, kPixels * 4);
- align_buffer_page_end(dst_opt, kPixels * 4);
- align_buffer_page_end(dst_c, kPixels * 4);
- MemRandomize(src, kPixels * 4);
- memset(dst_opt, 0, kPixels * 4);
- memset(dst_c, 1, kPixels * 4);
-
- ABGRToAR30Row_C(src, dst_c, kPixels);
-
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- if (has_avx2) {
- ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
- } else if (has_ssse3) {
- ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
- } else {
- ABGRToAR30Row_C(src, dst_opt, kPixels);
- }
- }
- for (int i = 0; i < kPixels * 4; ++i) {
- EXPECT_EQ(dst_opt[i], dst_c[i]);
- }
-
- free_aligned_buffer_page_end(src);
- free_aligned_buffer_page_end(dst_opt);
- free_aligned_buffer_page_end(dst_c);
-}
-#endif // HAS_ABGRTOAR30ROW_AVX2
-
-// TODO(fbarchard): Fix clamping issue affecting the U channel.
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- const int kBpc = 2; \
- align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
- align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
- reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
- } \
- memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
- static_cast<int>(dst_argb_opt[i + DOFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)
-
-TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2)
-
-static int Clamp(int y) {
- if (y < 0) {
- y = 0;
- }
- if (y > 255) {
- y = 255;
- }
- return y;
-}
-
-static int Clamp10(int y) {
- if (y < 0) {
- y = 0;
- }
- if (y > 1023) {
- y = 1023;
- }
- return y;
-}
-
-// Test 8 bit YUV to 8 bit RGB
-TEST_F(LibYUVConvertTest, TestH420ToARGB) {
- const int kSize = 256;
- int histogram_b[256];
- int histogram_g[256];
- int histogram_r[256];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
- align_buffer_page_end(argb_pixels, kSize * 4);
- uint8_t* orig_y = orig_yuv;
- uint8_t* orig_u = orig_y + kSize;
- uint8_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 128;  // 128 is zero (neutral) chroma.
- orig_v[i] = 128;
- }
-
- H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b = argb_pixels[i * 4 + 0];
- int g = argb_pixels[i * 4 + 1];
- int r = argb_pixels[i * 4 + 2];
- int a = argb_pixels[i * 4 + 3];
- ++histogram_b[b];
- ++histogram_g[g];
- ++histogram_r[r];
- int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f));
- EXPECT_NEAR(b, expected_y, 1);
- EXPECT_NEAR(g, expected_y, 1);
- EXPECT_NEAR(r, expected_y, 1);
- EXPECT_EQ(a, 255);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(argb_pixels);
-}
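-
-// The expected grey level above comes from limited-range expansion: 8-bit Y
-// in [16, 235] maps to full range as (y - 16) * 255 / 219, a gain of roughly
-// 1.164. A minimal integer sketch (rounding may differ by 1 from the
-// optimized paths):
-static inline int ExpandLimitedRangeY8Sketch(int y) {
-  return Clamp(((y - 16) * 255 + 109) / 219);  // +109 rounds the division
-}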
-
-// Test 10 bit YUV to 8 bit RGB
-TEST_F(LibYUVConvertTest, TestH010ToARGB) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(argb_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b = argb_pixels[i * 4 + 0];
- int g = argb_pixels[i * 4 + 1];
- int r = argb_pixels[i * 4 + 2];
- int a = argb_pixels[i * 4 + 3];
- ++histogram_b[b];
- ++histogram_g[g];
- ++histogram_r[r];
- int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
- EXPECT_NEAR(b, expected_y, 1);
- EXPECT_NEAR(g, expected_y, 1);
- EXPECT_NEAR(r, expected_y, 1);
- EXPECT_EQ(a, 255);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(argb_pixels);
-}
-
-// Test 10 bit YUV to 10 bit RGB
-// Caveat: Results are compared with EXPECT_NEAR because the expected values
-// are computed with float rounding.
-TEST_F(LibYUVConvertTest, TestH010ToAR30) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
-
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(ar30_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
- int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ar30_pixels);
-}
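-
-// AR30 packs 2:10:10:10 little-endian with blue in bits 0-9, green in bits
-// 10-19, red in bits 20-29 and 2-bit alpha in bits 30-31, matching the
-// extraction above; AB30 is the same layout with red and blue swapped.
-// A minimal packing sketch:
-static inline uint32_t PackAR30Sketch(uint32_t a2,
-                                      uint32_t r10,
-                                      uint32_t g10,
-                                      uint32_t b10) {
-  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
-}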
-
-// Test 10 bit YUV to 10 bit RGB
-// Caveat: Results are compared with EXPECT_NEAR because the expected values
-// are computed with float rounding.
-TEST_F(LibYUVConvertTest, TestH010ToAB30) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
-
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(ab30_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int r10 = reinterpret_cast<uint32_t*>(ab30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 10) & 1023;
- int b10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ab30_pixels);
-}
-
-// Test 8 bit YUV to 10 bit RGB
-TEST_F(LibYUVConvertTest, TestH420ToAR30) {
- const int kSize = 256;
- const int kHistSize = 1024;
- int histogram_b[kHistSize];
- int histogram_g[kHistSize];
- int histogram_r[kHistSize];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
- align_buffer_page_end(ar30_pixels, kSize * 4);
- uint8_t* orig_y = orig_yuv;
- uint8_t* orig_u = orig_y + kSize;
- uint8_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 128;  // 128 is zero (neutral) chroma.
- orig_v[i] = 128;
- }
-
- H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
- int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kHistSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ar30_pixels);
-}
-
-// Test RGB24 to ARGB and back to RGB24
-TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
- const int kSize = 256;
- align_buffer_page_end(orig_rgb24, kSize * 3);
- align_buffer_page_end(argb_pixels, kSize * 4);
- align_buffer_page_end(dest_rgb24, kSize * 3);
-
-  // Test a byte ramp across the RGB24 channels.
- for (int i = 0; i < kSize * 3; ++i) {
- orig_rgb24[i] = i;
- }
-
- RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
- ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
-
- for (int i = 0; i < kSize * 3; ++i) {
- EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
- }
-
- free_aligned_buffer_page_end(orig_rgb24);
- free_aligned_buffer_page_end(argb_pixels);
- free_aligned_buffer_page_end(dest_rgb24);
-}
-
-} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
deleted file mode 100644
index a7991d2b..00000000
--- a/files/unit_test/cpu_test.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "../unit_test/unit_test.h"
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/version.h"
-
-namespace libyuv {
-
-TEST_F(LibYUVBaseTest, TestCpuHas) {
- int cpu_flags = TestCpuFlag(-1);
- printf("Cpu Flags %d\n", cpu_flags);
-#if defined(__arm__) || defined(__aarch64__)
- int has_arm = TestCpuFlag(kCpuHasARM);
- printf("Has ARM %d\n", has_arm);
- int has_neon = TestCpuFlag(kCpuHasNEON);
- printf("Has NEON %d\n", has_neon);
-#endif
- int has_x86 = TestCpuFlag(kCpuHasX86);
- int has_sse2 = TestCpuFlag(kCpuHasSSE2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- int has_sse41 = TestCpuFlag(kCpuHasSSE41);
- int has_sse42 = TestCpuFlag(kCpuHasSSE42);
- int has_avx = TestCpuFlag(kCpuHasAVX);
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_erms = TestCpuFlag(kCpuHasERMS);
- int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
- int has_gfni = TestCpuFlag(kCpuHasGFNI);
- int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
- int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
- int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
- int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
- int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
- int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
- printf("Has X86 %d\n", has_x86);
- printf("Has SSE2 %d\n", has_sse2);
- printf("Has SSSE3 %d\n", has_ssse3);
- printf("Has SSE41 %d\n", has_sse41);
- printf("Has SSE42 %d\n", has_sse42);
- printf("Has AVX %d\n", has_avx);
- printf("Has AVX2 %d\n", has_avx2);
- printf("Has ERMS %d\n", has_erms);
- printf("Has FMA3 %d\n", has_fma3);
- printf("Has F16C %d\n", has_f16c);
- printf("Has GFNI %d\n", has_gfni);
- printf("Has AVX512BW %d\n", has_avx512bw);
- printf("Has AVX512VL %d\n", has_avx512vl);
- printf("Has AVX512VBMI %d\n", has_avx512vbmi);
- printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2);
- printf("Has AVX512VBITALG %d\n", has_avx512vbitalg);
- printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq);
-
-#if defined(__mips__)
- int has_mips = TestCpuFlag(kCpuHasMIPS);
- printf("Has MIPS %d\n", has_mips);
- int has_msa = TestCpuFlag(kCpuHasMSA);
- printf("Has MSA %d\n", has_msa);
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- printf("Has MMI %d\n", has_mmi);
-#endif
-}
-
-TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
-#if defined(__aarch64__)
- printf("Arm64 build\n");
-#endif
-#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
- printf("Neon build enabled\n");
-#endif
-#if defined(__x86_64__) || defined(_M_X64)
- printf("x64 build\n");
-#endif
-#ifdef _MSC_VER
- printf("_MSC_VER %d\n", _MSC_VER);
-#endif
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
- defined(VISUALC_HAS_AVX2))
- printf("Has AVX2 1\n");
-#else
- printf("Has AVX2 0\n");
-// If the compiler does not support AVX2, the AVX2 code paths are not
-// expected to be built.
-#endif
-}
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
- defined(_M_X64)
-TEST_F(LibYUVBaseTest, TestCpuId) {
- int has_x86 = TestCpuFlag(kCpuHasX86);
- if (has_x86) {
- int cpu_info[4];
- // Vendor ID:
- // AuthenticAMD AMD processor
- // CentaurHauls Centaur processor
- // CyrixInstead Cyrix processor
- // GenuineIntel Intel processor
- // GenuineTMx86 Transmeta processor
- // Geode by NSC National Semiconductor processor
- // NexGenDriven NexGen processor
- // RiseRiseRise Rise Technology processor
- // SiS SiS SiS SiS processor
- // UMC UMC UMC UMC processor
- CpuId(0, 0, cpu_info);
-    cpu_info[0] = cpu_info[1];  // Reorder to the 12-byte vendor string:
-    cpu_info[1] = cpu_info[3];  // EBX, EDX, ECX, NUL-terminated.
-    cpu_info[3] = 0;
- printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
- cpu_info[0], cpu_info[1], cpu_info[2]);
- EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
-
- // CPU Family and Model
- // 3:0 - Stepping
- // 7:4 - Model
- // 11:8 - Family
- // 13:12 - Processor Type
- // 19:16 - Extended Model
- // 27:20 - Extended Family
- CpuId(1, 0, cpu_info);
- int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
- int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
- printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
- model);
- }
-}
-#endif
-
-static int FileExists(const char* file_name) {
- FILE* f = fopen(file_name, "r");
- if (!f) {
- return 0;
- }
- fclose(f);
- return 1;
-}
-
-TEST_F(LibYUVBaseTest, TestLinuxNeon) {
- if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
- printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
-
- EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
- } else {
- printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
- }
-#if defined(__linux__) && defined(__ARM_NEON__)
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
-#endif
-}
-
-TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
- // Reset any masked flags that may have been set so auto init is enabled.
- MaskCpuFlags(0);
-
- int original_cpu_flags = TestCpuFlag(-1);
-
- // Test setting different CPU configurations.
- int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
- SetCpuFlags(cpu_flags);
- EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
-
- cpu_flags = kCpuHasX86 | kCpuInitialized;
- SetCpuFlags(cpu_flags);
- EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
-
- // Test that setting 0 turns auto-init back on.
- SetCpuFlags(0);
- EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
-
- // Restore the CPU flag mask.
- MaskCpuFlags(benchmark_cpu_info_);
-}
-
-} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
deleted file mode 100644
index 61941e63..00000000
--- a/files/unit_test/rotate_test.cc
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "../unit_test/unit_test.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/rotate.h"
-
-namespace libyuv {
-
-static void I420TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) {
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_i420_y_size = src_width * Abs(src_height);
- int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
- int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
- align_buffer_page_end(src_i420, src_i420_size);
- for (int i = 0; i < src_i420_size; ++i) {
- src_i420[i] = fastrand() & 0xff;
- }
-
- int dst_i420_y_size = dst_width * dst_height;
- int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
- int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
- align_buffer_page_end(dst_i420_c, dst_i420_size);
- align_buffer_page_end(dst_i420_opt, dst_i420_size);
- memset(dst_i420_c, 2, dst_i420_size);
- memset(dst_i420_opt, 3, dst_i420_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
- (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
- (src_width + 1) / 2, dst_i420_c, dst_width,
- dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- I420Rotate(
- src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
- src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
- dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
- (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i420_size; ++i) {
- EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i420_c);
- free_aligned_buffer_page_end(dst_i420_opt);
- free_aligned_buffer_page_end(src_i420);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-// TODO(fbarchard): Remove odd width tests.
-// Odd width tests work but are disabled because they use C code; they can be
-// exercised by passing an odd width on the command line or via an
-// environment variable.
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-static void I444TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) {
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_i444_y_size = src_width * Abs(src_height);
- int src_i444_uv_size = src_width * Abs(src_height);
- int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
- align_buffer_page_end(src_i444, src_i444_size);
- for (int i = 0; i < src_i444_size; ++i) {
- src_i444[i] = fastrand() & 0xff;
- }
-
- int dst_i444_y_size = dst_width * dst_height;
- int dst_i444_uv_size = dst_width * dst_height;
- int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
- align_buffer_page_end(dst_i444_c, dst_i444_size);
- align_buffer_page_end(dst_i444_opt, dst_i444_size);
- memset(dst_i444_c, 2, dst_i444_size);
- memset(dst_i444_opt, 3, dst_i444_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
- src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
- dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
- dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
- src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
- src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
- dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
- dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
- dst_width, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i444_size; ++i) {
- EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i444_c);
- free_aligned_buffer_page_end(dst_i444_opt);
- free_aligned_buffer_page_end(src_i444);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-// TODO(fbarchard): Remove odd width tests.
-// Odd width tests work but are disabled because they exercise the C code
-// paths, which can instead be tested by passing an odd width on the command
-// line or via an environment variable.
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-static void NV12TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) { // allow negative for inversion test.
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_nv12_y_size = src_width * Abs(src_height);
- int src_nv12_uv_size =
- ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
- int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
- align_buffer_page_end(src_nv12, src_nv12_size);
- for (int i = 0; i < src_nv12_size; ++i) {
- src_nv12[i] = fastrand() & 0xff;
- }
-
- int dst_i420_y_size = dst_width * dst_height;
- int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
- int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
- align_buffer_page_end(dst_i420_c, dst_i420_size);
- align_buffer_page_end(dst_i420_opt, dst_i420_size);
- memset(dst_i420_c, 2, dst_i420_size);
- memset(dst_i420_opt, 3, dst_i420_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
- (src_width + 1) & ~1, dst_i420_c, dst_width,
- dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
- (src_width + 1) & ~1, dst_i420_opt, dst_width,
- dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i420_size; ++i) {
- EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i420_c);
- free_aligned_buffer_page_end(dst_i420_opt);
- free_aligned_buffer_page_end(src_nv12);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-} // namespace libyuv
diff --git a/files/util/android/test_runner.py b/files/util/android/test_runner.py
deleted file mode 100755
index 8b06b7ea..00000000
--- a/files/util/android/test_runner.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""
-Runs tests on Android devices.
-
-This script exists to avoid Libyuv being broken by changes in the Chrome Android
-test execution toolchain. It also conveniently sets the CHECKOUT_SOURCE_ROOT
-environment variable.
-"""
-
-import os
-import sys
-
-SCRIPT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))
-CHROMIUM_BUILD_ANDROID_DIR = os.path.join(ROOT_DIR, 'build', 'android')
-sys.path.insert(0, CHROMIUM_BUILD_ANDROID_DIR)
-
-
-import test_runner # pylint: disable=W0406
-
-def main():
- # Override environment variable to make it possible for the scripts to find
- # the root directory (our symlinking of the Chromium build toolchain would
- # otherwise make them fail to do so).
- os.environ['CHECKOUT_SOURCE_ROOT'] = ROOT_DIR
- return test_runner.main()
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/fuzz/Android.bp b/fuzz/Android.bp
index 0e495899..a8d552b1 100644
--- a/files/fuzz/Android.bp
+++ b/fuzz/Android.bp
@@ -2,10 +2,10 @@
package {
// See: http://go/android-license-faq
// A large-scale-change added 'default_applicable_licenses' to import
- // all of the 'license_kinds' from "external_libyuv_files_license"
+ // all of the 'license_kinds' from "external_libyuv_license"
// to get the below license kinds:
// SPDX-license-identifier-BSD
- default_applicable_licenses: ["external_libyuv_files_license"],
+ default_applicable_licenses: ["external_libyuv_license"],
}
cc_fuzz {
diff --git a/files/fuzz/OWNERS b/fuzz/OWNERS
index 37481f5d..37481f5d 100644
--- a/files/fuzz/OWNERS
+++ b/fuzz/OWNERS
diff --git a/files/fuzz/mjpeg_dec_fuzz.cc b/fuzz/mjpeg_dec_fuzz.cc
index 3be8410a..3be8410a 100644
--- a/files/fuzz/mjpeg_dec_fuzz.cc
+++ b/fuzz/mjpeg_dec_fuzz.cc
diff --git a/files/include/libyuv.h b/include/libyuv.h
index aeffd5ef..a06e1233 100644
--- a/files/include/libyuv.h
+++ b/include/libyuv.h
@@ -26,6 +26,7 @@
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
#include "libyuv/version.h"
#include "libyuv/video_common.h"
diff --git a/files/include/libyuv/basic_types.h b/include/libyuv/basic_types.h
index 1bea67f2..1bea67f2 100644
--- a/files/include/libyuv/basic_types.h
+++ b/include/libyuv/basic_types.h
diff --git a/files/include/libyuv/compare.h b/include/libyuv/compare.h
index 3353ad71..3353ad71 100644
--- a/files/include/libyuv/compare.h
+++ b/include/libyuv/compare.h
diff --git a/files/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index e95b9d93..8293c919 100644
--- a/files/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -28,7 +28,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -55,20 +58,20 @@ extern "C" {
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+ !defined(__clang__) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#define HAS_SUMSQUAREERROR_AVX2
#endif
-// The following are available for GCC and clangcl 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+// The following are available for GCC and clangcl:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HAMMINGDISTANCE_SSSE3
#endif
-// The following are available for GCC and clangcl 64 bit:
+// The following are available for GCC and clangcl:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+ (defined(__x86_64__) || defined(__i386__))
#define HAS_HAMMINGDISTANCE_AVX2
#endif
@@ -84,11 +87,6 @@ extern "C" {
#define HAS_SUMSQUAREERROR_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_HAMMINGDISTANCE_MMI
-#define HAS_SUMSQUAREERROR_MMI
-#endif
-
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -107,9 +105,6 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t HammingDistance_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-uint32_t HammingDistance_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count);
uint32_t SumSquareError_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -125,9 +120,6 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-uint32_t SumSquareError_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count);
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
new file mode 100644
index 00000000..88619a4f
--- /dev/null
+++ b/include/libyuv/convert.h
@@ -0,0 +1,1045 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+// TODO(fbarchard): fix WebRTC source to include the following libyuv headers:
+#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to I444.
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to I210.
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert MM21 to NV12.
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert MM21 to I420.
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert MM21 to YUY2
+LIBYUV_API
+int MM21ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert MT2T to P010.
+// Note that src_y and src_uv point to packed 10-bit values, so each row of
+// the Y plane occupies 10 / 8 times the image width in bytes. For this
+// reason, src_stride_y and src_stride_uv are given in bytes.
+LIBYUV_API
+int MT2TToP010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
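+
+// Illustrative stride arithmetic (example values, not part of the API): a
+// packed 10-bit row stores 8 samples per 10 bytes, so for a 1280x720 frame:
+//   int src_stride_y = 1280 * 10 / 8;   // 1600 bytes per packed Y row
+//   int src_stride_uv = 1280 * 10 / 8;  // interleaved UV rows pack the same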
+
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I420 to I444.
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert 10-bit YUV to 8-bit.
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H210ToH420 I210ToI420
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H210ToH422 I210ToI422
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H410ToH420 I410ToI420
+LIBYUV_API
+int I410ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H410ToH444 I410ToI444
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H012ToH420 I012ToI420
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H212ToH422 I212ToI422
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H212ToH420 I212ToI420
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H412ToH444 I412ToI444
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H412ToH420 I412ToI420
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define I412ToI012 I410ToI010
+#define H410ToH010 I410ToI010
+#define H412ToH012 I410ToI010
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define I212ToI012 I210ToI010
+#define H210ToH010 I210ToI010
+#define H212ToH012 I210ToI010
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I010 to I410
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I012 to I412
+#define I012ToI412 I010ToI410
+
+// Convert I210 to I410
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I212 to I412
+#define I212ToI412 I210ToI410
+
+// Convert I010 to P010
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I210 to P210
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I012 to P012
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I212 to P212
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV12 to NV24.
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert NV16 to NV24.
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P010 to I010.
+LIBYUV_API
+int P010ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert P012 to I012.
+LIBYUV_API
+int P012ToI012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert P010 to P410.
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P012 to P412.
+#define P012ToP412 P010ToP410
+
+// Convert P016 to P416.
+#define P016ToP416 P010ToP410
+
+// Convert P210 to P410.
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P212 to P412.
+#define P212ToP412 P210ToP410
+
+// Convert P216 to P416.
+#define P216ToP416 P210ToP410
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
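+
+// A minimal usage sketch (argument names hypothetical): for an Android
+// YUV_420_888 image, pass the chroma planes' PixelStride as
+// src_pixel_stride_uv. A pixel stride of 1 means planar (I420-like) chroma
+// and 2 means interleaved (NV12/NV21-like) chroma; both are handled here.
+//   Android420ToI420(y, y_stride, u, u_stride, v, v_stride, uv_pixel_stride,
+//                    dst_y, width, dst_u, (width + 1) / 2,
+//                    dst_v, (width + 1) / 2, width, height);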
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// src_width/height are provided by the capture.
+// dst_width/height are used for clipping and determine the final size.
+LIBYUV_API
+int MJPGToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8_t* sample,
+ size_t sample_size,
+ int* width,
+ int* height);
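+
+// A minimal decode sketch (buffer names hypothetical): query the JPEG
+// dimensions first, then decode at the same size:
+//   int w, h;
+//   if (MJPGSize(sample, sample_size, &w, &h) == 0) {
+//     MJPGToI420(sample, sample_size, dst_y, w, dst_u, (w + 1) / 2,
+//                dst_v, (w + 1) / 2, w, h, w, h);
+//   }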
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_y" is the number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected. The caller
+//   should allocate the I420 buffer according to the rotation.
+// "dst_stride_u" is the number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected.
+// "crop_x" and "crop_y" are the starting position for cropping.
+//   To center: crop_x = (src_width - crop_width) / 2,
+//              crop_y = (src_height - crop_height) / 2.
+// "src_width" / "src_height" is the size of src_frame in pixels.
+//   "src_height" can be negative, indicating a vertically flipped image
+//   source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//   Both must be less than or equal to src_width/src_height.
+//   Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a FourCC code, e.g. 'I420' or 'YUY2'.
+// Returns 0 on success; -1 for an invalid parameter. Any non-zero value
+// indicates failure.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
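+
+// A usage sketch under assumed capture parameters (buffer names
+// hypothetical; FOURCC() is the packing macro from libyuv/video_common.h):
+// crop a centered 640x480 window from a 1280x720 YUY2 capture and rotate it
+// 90 degrees. Cropping is pre-rotation, so the I420 buffers are 480x640:
+//   int crop_x = (1280 - 640) / 2;  // 320
+//   int crop_y = (720 - 480) / 2;   // 120
+//   ConvertToI420(sample, sample_size, dst_y, 480, dst_u, 240, dst_v, 240,
+//                 crop_x, crop_y, 1280, 720, 640, 480, kRotate90,
+//                 FOURCC('Y', 'U', 'Y', '2'));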
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
new file mode 100644
index 00000000..35eeac9b
--- /dev/null
+++ b/include/libyuv/convert_argb.h
@@ -0,0 +1,2315 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+#include "libyuv/scale.h" // For enum FilterMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Conversion matrices for YUV to RGB.
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
+
+// Conversion matrices for YVU to BGR.
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
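+
+// Informal summary of the prefix convention used by the constants above and
+// the functions below (inferred from this header, not an official table):
+//   I = BT.601 limited range    J = BT.601 full range (JPEG)
+//   H = BT.709 limited range    F = BT.709 full range
+//   U = BT.2020 limited range   V = BT.2020 full range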
+
+// Macros for conversions to channel-swapped destinations (Matrix variants).
+// Each swaps U and V and passes the mirrored kYvu*Constants matrix.
+// TODO(fbarchard): Add a macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuvF709ConstantsVU kYvuF709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define kYuvV2020ConstantsVU kYvuV2020Constants
+
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
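+
+// Illustrative expansion (not an additional API): the macros token-paste
+// "VU" onto the constants argument, so it must be a bare kYuv* name. E.g.
+//   NV12ToABGRMatrix(y, ys, uv, uvs, dst, ds, kYuvI601Constants, w, h)
+// expands to
+//   NV21ToARGBMatrix(y, ys, uv, uvs, dst, ds, kYvuI601Constants, w, h)
+// via the kYuvI601ConstantsVU alias above.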
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to RGB24.
+LIBYUV_API
+int I444ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert I444 to RAW.
+LIBYUV_API
+int I444ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
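+
+// Informal note on "attenuate" (a sketch, assuming typical use): when
+// non-zero, the converted RGB channels are multiplied by alpha
+// (premultiplied alpha); when zero, alpha is copied through unmodified.
+//   I420AlphaToARGB(y, ys, u, us, v, vs, a, as, dst, ds, w, h,
+//                   1 /* attenuate */);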
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I422 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I422AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I422 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV12 to ABGR.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Aliases: swapping the channel order of both the source and the
+// destination reuses the same conversion.
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert AR64 to ARGB.
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AB64 to ABGR.
+#define AB64ToABGR AR64ToARGB
+
+// Convert AB64 to ARGB.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+ int src_stride_ab64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR64 to ABGR.
+#define AR64ToABGR AB64ToARGB
+
+// Convert AR64 To AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height);
+
+// Convert AB64 To AR64.
+#define AB64ToAR64 AR64ToAB64
+
+// Convert MJPG to ARGB.
+// src_width/height are provided by the capture.
+// dst_width/height, used for clipping, determine the final size.
+LIBYUV_API
+int MJPGToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
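+// A hedged usage sketch (buffer names and sizes are illustrative, not part
+// of the API): decode one MJPG frame at its native size, with no clipping.
+//   uint8_t* argb = (uint8_t*)malloc((size_t)width * height * 4);
+//   int r = MJPGToARGB(jpeg_data, jpeg_size, argb, width * 4,
+//                      width, height,   // size reported by the capture
+//                      width, height);  // dst equal to src: no clipping
+//   // r == 0 indicates success.
+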
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
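+// In the Android420 functions, "src_pixel_stride_uv" is the byte distance
+// between successive U (or V) samples within a row, as reported by
+// Android's YUV_420_888 planes: 1 when chroma is planar (I420-like) and
+// 2 when U and V are interleaved (NV12/NV21-like). A sketch with
+// placeholder plane pointers:
+//   Android420ToARGB(y, y_row_stride, u, uv_row_stride, v, uv_row_stride,
+//                    uv_pixel_stride,  // 1 = planar, 2 = interleaved
+//                    argb, width * 4, width, height);
+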
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert I422 to RGB24.
+LIBYUV_API
+int I422ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert I422 to RAW.
+LIBYUV_API
+int I422ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with a 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The dither matrix is laid out with the first byte as the upper-left entry.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
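+// A worked example (the matrix values are illustrative): the classic 4x4
+// ordered-dither pattern, halved to fit the recommended 0..7 range.
+//   static const uint8_t kDither4x4[16] = {0, 4, 1, 5, 6, 2, 7, 3,
+//                                          1, 5, 0, 4, 7, 3, 6, 2};
+//   I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
+//                      rgb565, width * 2,  // 2 bytes per RGB565 pixel
+//                      kDither4x4, width, height);
+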
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to AB30.
+LIBYUV_API
+int I420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AB30.
+LIBYUV_API
+int H420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
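+// The Matrix variants take an explicit YuvConstants selecting the
+// colorspace and range; libyuv provides constants such as
+// kYuvI601Constants (BT.601 limited range) and kYuvJPEGConstants (full
+// range). The plain wrappers above are typically equivalent to passing
+// one of these. A hedged sketch:
+//   I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
+//                    argb, width * 4, &kYuvI601Constants, width, height);
+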
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to RGB24 with matrix.
+LIBYUV_API
+int I444ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 420 YUV to AR30 with matrix.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to AR30 with matrix.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 444 YUV to AR30 with matrix.
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 420 YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 12 bit 420 YUV to AR30 with matrix. The 12 bit samples are
+// multiplied up into the high bits, which allows any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 444 YUV to ARGB with matrix.
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P010 to ARGB with matrix.
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P210 to ARGB with matrix.
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P010 to AR30 with matrix.
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P210 to AR30 with matrix.
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// P012 and P010 use the most significant bits, so the conversion is the same.
+// Convert P012 to ARGB with matrix.
+#define P012ToARGBMatrix P010ToARGBMatrix
+// Convert P012 to AR30 with matrix.
+#define P012ToAR30Matrix P010ToAR30Matrix
+// Convert P212 to ARGB with matrix.
+#define P212ToARGBMatrix P210ToARGBMatrix
+// Convert P212 to AR30 with matrix.
+#define P212ToAR30Matrix P210ToAR30Matrix
+
+// Convert P016 to ARGB with matrix.
+#define P016ToARGBMatrix P010ToARGBMatrix
+// Convert P016 to AR30 with matrix.
+#define P016ToAR30Matrix P010ToAR30Matrix
+// Convert P216 to ARGB with matrix.
+#define P216ToARGBMatrix P210ToARGBMatrix
+// Convert P216 to AR30 with matrix.
+#define P216ToAR30Matrix P210ToAR30Matrix
+
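+// Why the aliases above work: P010/P012/P016 samples are MSB-aligned in a
+// 16 bit lane, e.g. a 10 bit value v is stored as (v << 6) and a 12 bit
+// value as (v << 4), so a conversion that consumes only the top bits is
+// depth-agnostic. Illustrative arithmetic:
+//   uint16_t p010 = (uint16_t)(v10 << 6);  // 10 bit sample, MSB aligned
+//   uint16_t p012 = (uint16_t)(v12 << 4);  // 12 bit sample, MSB aligned
+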
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
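+// "attenuate": when non-zero, the RGB channels are premultiplied by alpha
+// (roughly r = r * a / 255) to produce attenuated ARGB; when zero, alpha
+// is copied through unmultiplied. A hedged sketch:
+//   I420AlphaToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
+//                         a, a_stride, argb, width * 4,
+//                         &kYuvI601Constants, width, height,
+//                         1);  // premultiply RGB by alpha
+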
+// Convert I422 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I010 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I210 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I410 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGB24 with matrix.
+LIBYUV_API
+int I422ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGB565 with specified color matrix.
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB with matrix. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
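+// The Filter variants take a FilterMode (see scale.h) controlling chroma
+// upsampling: kFilterNone selects point sampling, kFilterBilinear
+// interpolates between UV samples. A hedged sketch:
+//   I420ToARGBMatrixFilter(y, y_stride, u, u_stride, v, v_stride,
+//                          argb, width * 4, &kYuvI601Constants,
+//                          width, height, kFilterBilinear);
+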
+// Convert I422 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I422 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I422ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I420 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I420ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I420 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I422 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I010 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I210 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert P010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_argb" is the number of bytes in a row of the dst_argb plane.
+//   Normally this would be dst_width * 4, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected: the caller
+//   should allocate the destination buffer according to the rotation.
+// "crop_x" and "crop_y" are the starting position for cropping.
+//   To center: crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is the size of src_frame in pixels.
+//   "src_height" can be negative, indicating a vertically flipped image
+//   source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//   Must be less than or equal to src_width/src_height.
+//   Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc, e.g. 'I420' or 'YUY2'.
+// Returns 0 for success; -1 for an invalid parameter; other non-zero values
+// indicate failure.
+LIBYUV_API
+int ConvertToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
+
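+// A hedged sketch: center-crop a 640x480 I420 sample to 320x240 and rotate
+// 90 degrees, giving a 240x320 ARGB output (kRotate90 and the FOURCC macro
+// are declared in rotate.h and video_common.h; buffer names are
+// placeholders).
+//   ConvertToARGB(frame, frame_size, argb, 240 * 4,  // rotated width is 240
+//                 (640 - 320) / 2, (480 - 240) / 2,  // centered crop
+//                 640, 480, 320, 240,
+//                 kRotate90, FOURCC('I', '4', '2', '0'));
+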
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/files/include/libyuv/convert_from.h b/include/libyuv/convert_from.h
index 861418d0..32f42a63 100644
--- a/files/include/libyuv/convert_from.h
+++ b/include/libyuv/convert_from.h
@@ -23,6 +23,7 @@ extern "C" {
// Convert 8 bit YUV to 10 bit.
#define H420ToH010 I420ToI010
+LIBYUV_API
int I420ToI010(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -38,6 +39,24 @@ int I420ToI010(const uint8_t* src_y,
int width,
int height);
+// Convert 8 bit YUV to 12 bit.
+#define H420ToH012 I420ToI012
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
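+// A hedged note on the expansion: 8 bit samples are shifted up into the
+// wider range, e.g. 8 bit white 235 becomes 235 << 2 = 940 in 10 bit and
+// 235 << 4 = 3760 in 12 bit, preserving limited-range levels.
+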
LIBYUV_API
int I420ToI422(const uint8_t* src_y,
int src_stride_y,
@@ -131,6 +150,10 @@ int I420ToUYVY(const uint8_t* src_y,
int width,
int height);
+// The following are from convert_argb.h.
+// DEPRECATED: These prototypes will be removed in the future; use
+// convert_argb.h instead.
+
+// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
@@ -143,18 +166,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -167,181 +179,6 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height);
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/files/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index cbbef6fe..ff2a581a 100644
--- a/files/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
@@ -149,6 +153,30 @@ int ARGBToI444(const uint8_t* src_argb,
int width,
int height);
+// Convert ARGB to AR64.
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height);
+
+// Convert ABGR to AB64.
+#define ABGRToAB64 ARGBToAR64
+
+// Convert ARGB to AB64.
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height);
+
+// Convert ABGR to AR64.
+#define ABGRToAR64 ARGBToAB64
+
// Convert ARGB To I422.
LIBYUV_API
int ARGBToI422(const uint8_t* src_argb,
@@ -181,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height);
@@ -194,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height);
@@ -210,6 +238,50 @@ int ARGBToJ400(const uint8_t* src_argb,
int width,
int height);
+// Convert ABGR to J420. (JPEG full range I420).
+LIBYUV_API
+int ABGRToJ420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height);
+
+// Convert ABGR to J422.
+LIBYUV_API
+int ABGRToJ422(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height);
+
+// Convert ABGR to J400. (JPEG full range).
+LIBYUV_API
+int ABGRToJ400(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// Convert RGBA to J400. (JPEG full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8_t* src_argb,
@@ -250,25 +322,25 @@ int ARGBToNV21(const uint8_t* src_argb,
int width,
int height);
-// Convert ARGB To NV21.
+// Convert ABGR To NV12.
LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
- int src_stride_argb,
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
int width,
int height);
-// Convert ABGR To NV12.
+// Convert ABGR To NV21.
LIBYUV_API
-int ABGRToNV12(const uint8_t* src_abgr,
+int ABGRToNV21(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
int width,
int height);
@@ -290,6 +362,17 @@ int ARGBToUYVY(const uint8_t* src_argb,
int width,
int height);
+// Convert RAW to JNV21 (JPEG full range NV21).
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index b01cd25c..5a81e7c9 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -31,24 +31,35 @@ static const int kCpuHasX86 = 0x10;
static const int kCpuHasSSE2 = 0x20;
static const int kCpuHasSSSE3 = 0x40;
static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100; // unused at this time.
+static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasF16C = 0x2000;
-static const int kCpuHasGFNI = 0x4000;
-static const int kCpuHasAVX512BW = 0x8000;
-static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512BW = 0x4000;
+static const int kCpuHasAVX512VL = 0x8000;
+static const int kCpuHasAVX512VNNI = 0x10000;
static const int kCpuHasAVX512VBMI = 0x20000;
static const int kCpuHasAVX512VBMI2 = 0x40000;
static const int kCpuHasAVX512VBITALG = 0x80000;
-static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
+static const int kCpuHasAVX10 = 0x100000;
+static const int kCpuHasAVXVNNI = 0x200000;
+static const int kCpuHasAVXVNNIINT8 = 0x400000;
// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x200000;
-static const int kCpuHasMSA = 0x400000;
-static const int kCpuHasMMI = 0x800000;
+static const int kCpuHasMIPS = 0x800000;
+static const int kCpuHasMSA = 0x1000000;
+
+// These flags are only valid on LOONGARCH processors.
+static const int kCpuHasLOONGARCH = 0x2000000;
+static const int kCpuHasLSX = 0x4000000;
+static const int kCpuHasLASX = 0x8000000;
+
+// These flags are only valid on RISCV processors.
+static const int kCpuHasRISCV = 0x10000000;
+static const int kCpuHasRVV = 0x20000000;
+static const int kCpuHasRVVZVFH = 0x40000000;
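+
+// A hedged usage sketch: flags are tested at runtime to choose a SIMD path.
+//   if (TestCpuFlag(kCpuHasAVX2)) {
+//     /* dispatch to the AVX2 row functions */
+//   }
+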
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
@@ -71,6 +82,10 @@ static __inline int TestCpuFlag(int test_flag) {
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int RiscvCpuCaps(const char* cpuinfo_name);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
diff --git a/include/libyuv/loongson_intrinsics.h b/include/libyuv/loongson_intrinsics.h
new file mode 100644
index 00000000..1d613def
--- /dev/null
+++ b/include/libyuv/loongson_intrinsics.h
@@ -0,0 +1,1949 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+#define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Xiwei Gu <guxiwei-hf@loongson.cn>
+ * Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is a header file for the LoongArch builtin extensions.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: New functions added, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0); \
+ _OUT1 = _INS(_IN1); \
+ }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1); \
+ _OUT1 = _INS(_IN2, _IN3); \
+ }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1, _IN2); \
+ _OUT1 = _INS(_IN3, _IN4, _IN5); \
+ }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+ DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+ _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+ DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+ _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
+ DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+ }
+
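+// For example (a minimal sketch), DUP2_ARG2 fans a two-argument intrinsic
+// out over two independent input/output pairs:
+//   __m128i a, b;
+//   DUP2_ARG2(__lsx_vadd_b, in0, in1, in2, in3, a, b);
+//   // a = __lsx_vadd_b(in0, in1); b = __lsx_vadd_b(in2, in3);
+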
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ * in_c : 1,1,1,1, 1,1,1,1
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ * out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed word elements from in_c.
+ * Example : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ * out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of half-word vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_w_h(in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ * Arguments : Inputs - _in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lsx_vclip_h(_in)
+ * _in : -8,2,280,249, -8,255,280,249
+ * min : 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+ __m128i out;
+
+ out = __lsx_vmax_h(min, _in);
+ out = __lsx_vmin_h(max, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed half-word elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_h(_in)
+ * _in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_h(_in, 0);
+ out = __lsx_vsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed word elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_w(_in)
+ * _in : -8,255,280,249
+ * out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_w(_in, 0);
+ out = __lsx_vsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments : Inputs - _in0, _in1
+ * Outputs - _in0, _in1 (in-place)
+ * Details : Swapping of two input variables using xor
+ * Example : LSX_SWAP(_in0, _in1)
+ * _in0 : 1,2,3,4
+ * _in1 : 5,6,7,8
+ * _in0(out) : 5,6,7,8
+ * _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1) \
+ { \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ _in1 = __lsx_vxor_v(_in0, _in1); \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _t0, _t1, _t2, _t3; \
+ \
+ _t0 = __lsx_vilvl_w(_in1, _in0); \
+ _t1 = __lsx_vilvh_w(_in1, _in0); \
+ _t2 = __lsx_vilvl_w(_in3, _in2); \
+ _t3 = __lsx_vilvh_w(_in3, _in2); \
+ _out0 = __lsx_vilvl_d(_t2, _t0); \
+ _out1 = __lsx_vilvh_d(_t2, _t0); \
+ _out2 = __lsx_vilvl_d(_t3, _t1); \
+ _out3 = __lsx_vilvh_d(_t3, _t1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Details : The rows of the matrix become columns, and the columns
+ * become rows.
+ * Example : LSX_TRANSPOSE8x8_B
+ * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ * _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ * _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ * _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i zero = {0}; \
+ __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _t0 = __lsx_vilvl_b(_in2, _in0); \
+ _t1 = __lsx_vilvl_b(_in3, _in1); \
+ _t2 = __lsx_vilvl_b(_in6, _in4); \
+ _t3 = __lsx_vilvl_b(_in7, _in5); \
+ _t4 = __lsx_vilvl_b(_t1, _t0); \
+ _t5 = __lsx_vilvh_b(_t1, _t0); \
+ _t6 = __lsx_vilvl_b(_t3, _t2); \
+ _t7 = __lsx_vilvh_b(_t3, _t2); \
+ _out0 = __lsx_vilvl_w(_t6, _t4); \
+ _out2 = __lsx_vilvh_w(_t6, _t4); \
+ _out4 = __lsx_vilvl_w(_t7, _t5); \
+ _out6 = __lsx_vilvh_w(_t7, _t5); \
+ _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
+ _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
+ _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
+ _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details :
+ * Example :
+ * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70
+ * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71
+ * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72
+ * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73
+ * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74
+ * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75
+ * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76
+ * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _s0 = __lsx_vilvl_h(_in6, _in4); \
+ _s1 = __lsx_vilvl_h(_in7, _in5); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in6, _in4); \
+ _s1 = __lsx_vilvh_h(_in7, _in5); \
+ _t2 = __lsx_vilvl_h(_s1, _s0); \
+ _t3 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvl_h(_in2, _in0); \
+ _s1 = __lsx_vilvl_h(_in3, _in1); \
+ _t4 = __lsx_vilvl_h(_s1, _s0); \
+ _t5 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in2, _in0); \
+ _s1 = __lsx_vilvh_h(_in3, _in1); \
+ _t6 = __lsx_vilvl_h(_s1, _s0); \
+ _t7 = __lsx_vilvh_h(_s1, _s0); \
+ \
+ _out0 = __lsx_vpickev_d(_t0, _t4); \
+ _out2 = __lsx_vpickev_d(_t1, _t5); \
+ _out4 = __lsx_vpickev_d(_t2, _t6); \
+ _out6 = __lsx_vpickev_d(_t3, _t7); \
+ _out1 = __lsx_vpickod_d(_t0, _t4); \
+ _out3 = __lsx_vpickod_d(_t1, _t5); \
+ _out5 = __lsx_vpickod_d(_t2, _t6); \
+ _out7 = __lsx_vpickod_d(_t3, _t7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments : Inputs - _in0 ~ _in7 (input 8x4 byte block)
+ * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
+ * Return Type - byte
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LSX_TRANSPOSE8x4_B
+ * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ \
+ _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
+ _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
+ _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
+ _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
+ \
+ _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
+ _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
+ \
+ _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
+ _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
+ _out1 = __lsx_vilvh_d(_out2, _out0); \
+ _out3 = __lsx_vilvh_d(_out0, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details :
+ * Example :
+ * 000,001,002,003,004,005,006,007
+ * 008,009,010,011,012,013,014,015
+ * 016,017,018,019,020,021,022,023
+ * 024,025,026,027,028,029,030,031
+ * 032,033,034,035,036,037,038,039
+ * 040,041,042,043,044,045,046,047 000,008,...,112,120
+ * 048,049,050,051,052,053,054,055 001,009,...,113,121
+ * 056,057,058,059,060,061,062,063 to 002,010,...,114,122
+ * 064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
+ * 072,073,074,075,076,077,078,079 004,012,...,116,124
+ * 080,081,082,083,084,085,086,087 005,013,...,117,125
+ * 088,089,090,091,092,093,094,095 006,014,...,118,126
+ * 096,097,098,099,100,101,102,103 007,015,...,119,127
+ * 104,105,106,107,108,109,110,111
+ * 112,113,114,115,116,117,118,119
+ * 120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+ _tmp0, _tmp1, _tmp2, _tmp3); \
+ DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
+ _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
+ DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
+ DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
+ DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
+ DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in3); \
+ _out1 = __lsx_vadd_b(_in1, _in2); \
+ _out2 = __lsx_vsub_b(_in1, _in2); \
+ _out3 = __lsx_vsub_b(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in3); \
+ _out1 = __lsx_vadd_h(_in1, _in2); \
+ _out2 = __lsx_vsub_h(_in1, _in2); \
+ _out3 = __lsx_vsub_h(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in3); \
+ _out1 = __lsx_vadd_w(_in1, _in2); \
+ _out2 = __lsx_vsub_w(_in1, _in2); \
+ _out3 = __lsx_vsub_w(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in3); \
+ _out1 = __lsx_vadd_d(_in1, _in2); \
+ _out2 = __lsx_vsub_d(_in1, _in2); \
+ _out3 = __lsx_vsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in7); \
+ _out1 = __lsx_vadd_b(_in1, _in6); \
+ _out2 = __lsx_vadd_b(_in2, _in5); \
+ _out3 = __lsx_vadd_b(_in3, _in4); \
+ _out4 = __lsx_vsub_b(_in3, _in4); \
+ _out5 = __lsx_vsub_b(_in2, _in5); \
+ _out6 = __lsx_vsub_b(_in1, _in6); \
+ _out7 = __lsx_vsub_b(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in7); \
+ _out1 = __lsx_vadd_h(_in1, _in6); \
+ _out2 = __lsx_vadd_h(_in2, _in5); \
+ _out3 = __lsx_vadd_h(_in3, _in4); \
+ _out4 = __lsx_vsub_h(_in3, _in4); \
+ _out5 = __lsx_vsub_h(_in2, _in5); \
+ _out6 = __lsx_vsub_h(_in1, _in6); \
+ _out7 = __lsx_vsub_h(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in7); \
+ _out1 = __lsx_vadd_w(_in1, _in6); \
+ _out2 = __lsx_vadd_w(_in2, _in5); \
+ _out3 = __lsx_vadd_w(_in3, _in4); \
+ _out4 = __lsx_vsub_w(_in3, _in4); \
+ _out5 = __lsx_vsub_w(_in2, _in5); \
+ _out6 = __lsx_vsub_w(_in1, _in6); \
+ _out7 = __lsx_vsub_w(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in7); \
+ _out1 = __lsx_vadd_d(_in1, _in6); \
+ _out2 = __lsx_vadd_d(_in2, _in5); \
+ _out3 = __lsx_vadd_d(_in3, _in4); \
+ _out4 = __lsx_vsub_d(_in3, _in4); \
+ _out5 = __lsx_vsub_d(_in2, _in5); \
+ _out6 = __lsx_vsub_d(_in1, _in6); \
+ _out7 = __lsx_vsub_d(_in0, _in7); \
+ }
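+
+/* The butterfly macros above implement the add/subtract stage common to
+ * DCT-style transforms. A minimal sketch, assuming four halfword vectors
+ * _r0.._r3 are already loaded (names illustrative):
+ *   __m128i _s0, _s1, _d1, _d0;
+ *   LSX_BUTTERFLY_4_H(_r0, _r1, _r2, _r3, _s0, _s1, _d1, _d0);
+ *   // _s0 = _r0 + _r3, _s1 = _r1 + _r2, _d1 = _r1 - _r2, _d0 = _r0 - _r3
+ */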
+
+#endif // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_b(in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
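+
+/* Cross-checking the documented example above: each output word is the sum
+ * of one even/odd product pair, e.g. 1*8 + 2*7 = 22 and 3*6 + 4*5 = 38,
+ * which yields 22,38,38,22 in each 128-bit lane. */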
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed doubleword
+ * Details : Signed word elements from in_h are multiplied with
+ * signed word elements from in_l producing a result
+ * twice the size of input i.e. signed double-word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_d_w(in_h, in_l);
+ out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l, producing results
+ * twice the size of the input, i.e. signed word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
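+
+/* Cross-checking the documented example above: the dot-product terms are
+ * the same 22,38,38,22 as for __lasx_xvdp2_w_h, and adding the accumulator
+ * in_c (1,2,3,4) gives 23,40,41,26 in each 128-bit lane. */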
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * unsigned halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are added
+ * together and subtracted from the double-width elements of
+ * the in_c vector.
+ * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ out = __lasx_xvsub_h(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l, producing results
+ * twice the size of the input, i.e. signed word.
+ * The products of adjacent odd-even element pairs are added
+ * together and subtracted from the double-width elements of
+ * the in_c vector.
+ * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * in_c : 0,0,0,0, 0,0,0,0
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvsub_w(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * four times the size of input i.e. signed doubleword.
+ * The products of four adjacent element pairs are then
+ * added together and stored to the out vector.
+ * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ * out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvhaddw_d_w(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed byte to signed halfword) and added; the results
+ * are stored to the out vector.
+ * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and added; the results
+ * are stored to the out vector.
+ * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended
+ * (signed byte to signed halfword) and added; the results
+ * are stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and added; the results
+ * are stored to the out vector.
+ * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are zero-extended
+ * (unsigned byte to unsigned halfword) and added; the
+ * results are stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_hu_bu(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is zero-extended (unsigned byte to
+ * unsigned halfword) and added to the in_h vector.
+ * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_hu_bu(in_l, 0);
+ out = __lasx_xvadd_h(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is sign-extended (signed halfword to
+ * signed word) and added to the in_h vector.
+ * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * in_h : 0, 1,0,0, -1,0,0,1,
+ * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
+ * out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvadd_w(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the lower half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended (signed
+ * halfword to signed word) and multiplied; the result is
+ * added to the vector in_c, then stored to the out vector.
+ * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 5,6,7,8
+ * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
+ * -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the higher half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and multiplied; the
+ * result is added to the vector in_c, then stored to the
+ * out vector.
+ * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended (signed
+ * halfword to signed word) and multiplied; the results are
+ * stored to the out vector.
+ * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvmul_w(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the higher
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and multiplied; the
+ * results are stored to the out vector.
+ * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h with unsigned saturation.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is zero-extended (unsigned byte to
+ * unsigned halfword) and added to in_h with unsigned
+ * saturation; the results are stored to the out vector.
+ * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ * 0,0,0,1
+ * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+ __m256i tmp1, out;
+ __m256i zero = {0};
+
+ tmp1 = __lasx_xvilvl_b(zero, in_l);
+ out = __lasx_xvsadd_hu(in_h, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments : Inputs - in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lasx_xvclip_h(in, min, max)
+ * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+ __m256i out;
+
+ out = __lasx_xvmax_h(min, in);
+ out = __lasx_xvmin_h(max, out);
+ return out;
+}
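+
+/* A hypothetical usage sketch (variable names illustrative): clamp halfword
+ * sums to a 0..255 range before packing to bytes.
+ *   __m256i _min = __lasx_xvreplgr2vr_h(0);
+ *   __m256i _max = __lasx_xvreplgr2vr_h(255);
+ *   __m256i _res = __lasx_xvclip_h(_sum, _min, _max);
+ * For this common case, __lasx_xvclip255_h below avoids materializing the
+ * two bound vectors. */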
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_h(in, 0);
+ out = __lasx_xvsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Output - out (output vector with clipped elements)
+ * Return Type - signed word
+ * Example : out = __lasx_xvclip255_w(in)
+ * in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_w(in, 0);
+ out = __lasx_xvsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for halfword operation is 0-7
+ * Example : out = __lasx_xvsplati_l_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ * idx : 0x02
+ * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x02);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for this halfword operation is 8-15
+ * Example : out = __lasx_xvsplati_h_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ * idx : 0x09
+ * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x13);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
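+
+/* A hypothetical dispatch sketch following the guidance above (idx and
+ * variable names illustrative): pick the variant from the index.
+ *   __m256i _coeff = (idx < 8) ? __lasx_xvsplati_l_h(_v, idx)
+ *                              : __lasx_xvsplati_h_h(_v, idx);
+ */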
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * _in0 : 1,2,3,4
+ * _in1 : 1,2,3,4
+ * _in2 : 1,2,3,4
+ * _in3 : 1,2,3,4
+ *
+ * _out0 : 1,1,1,1
+ * _out1 : 2,2,2,2
+ * _out2 : 3,3,3,3
+ * _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
+ _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
+ _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
+ _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
+ _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
+ _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * _in0 : 1,2,3,4,5,6,7,8
+ * _in1 : 2,2,3,4,5,6,7,8
+ * _in2 : 3,2,3,4,5,6,7,8
+ * _in3 : 4,2,3,4,5,6,7,8
+ * _in4 : 5,2,3,4,5,6,7,8
+ * _in5 : 6,2,3,4,5,6,7,8
+ * _in6 : 7,2,3,4,5,6,7,8
+ * _in7 : 8,2,3,4,5,6,7,8
+ *
+ * _out0 : 1,2,3,4,5,6,7,8
+ * _out1 : 2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_w(_in3, _in1); \
+ _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_w(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvl_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_w(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_w(_in7, _in5); \
+ _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
+ _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
+ _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
+ _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
+ _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
+ _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
+ _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
+ _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
+ _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
+ _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
+ _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
+ _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
+ _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 halfword block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 halfword block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 halfword block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE16x8_H
+ * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ \
+ _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Return Type - signed halfword
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in1, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in2); \
+ _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out1 = __lasx_xvilvh_d(_out0, _out0); \
+ _out3 = __lasx_xvilvh_d(_out2, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * (input 8x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x8 byte block)
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
+ _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
+ _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
+ _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
+ _out1 = __lasx_xvbsrl_v(_out0, 8); \
+ _out3 = __lasx_xvbsrl_v(_out2, 8); \
+ _out5 = __lasx_xvbsrl_v(_out4, 8); \
+ _out7 = __lasx_xvbsrl_v(_out6, 8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments : Inputs - _in0, _in1, ~
+ * Outputs - _out0, _out1, ~
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE8x8_H
+ * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _s0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
+ _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
+ _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
+ _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
+ _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
+ _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
+ _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
+ _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_4
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in3); \
+ _out1 = __lasx_xvadd_b(_in1, _in2); \
+ _out2 = __lasx_xvsub_b(_in1, _in2); \
+ _out3 = __lasx_xvsub_b(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in3); \
+ _out1 = __lasx_xvadd_h(_in1, _in2); \
+ _out2 = __lasx_xvsub_h(_in1, _in2); \
+ _out3 = __lasx_xvsub_h(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in3); \
+ _out1 = __lasx_xvadd_w(_in1, _in2); \
+ _out2 = __lasx_xvsub_w(_in1, _in2); \
+ _out3 = __lasx_xvsub_w(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in3); \
+ _out1 = __lasx_xvadd_d(_in1, _in2); \
+ _out2 = __lasx_xvsub_d(_in1, _in2); \
+ _out3 = __lasx_xvsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_8
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in7); \
+ _out1 = __lasx_xvadd_b(_in1, _in6); \
+ _out2 = __lasx_xvadd_b(_in2, _in5); \
+ _out3 = __lasx_xvadd_b(_in3, _in4); \
+ _out4 = __lasx_xvsub_b(_in3, _in4); \
+ _out5 = __lasx_xvsub_b(_in2, _in5); \
+ _out6 = __lasx_xvsub_b(_in1, _in6); \
+ _out7 = __lasx_xvsub_b(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in7); \
+ _out1 = __lasx_xvadd_h(_in1, _in6); \
+ _out2 = __lasx_xvadd_h(_in2, _in5); \
+ _out3 = __lasx_xvadd_h(_in3, _in4); \
+ _out4 = __lasx_xvsub_h(_in3, _in4); \
+ _out5 = __lasx_xvsub_h(_in2, _in5); \
+ _out6 = __lasx_xvsub_h(_in1, _in6); \
+ _out7 = __lasx_xvsub_h(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in7); \
+ _out1 = __lasx_xvadd_w(_in1, _in6); \
+ _out2 = __lasx_xvadd_w(_in2, _in5); \
+ _out3 = __lasx_xvadd_w(_in3, _in4); \
+ _out4 = __lasx_xvsub_w(_in3, _in4); \
+ _out5 = __lasx_xvsub_w(_in2, _in5); \
+ _out6 = __lasx_xvsub_w(_in1, _in6); \
+ _out7 = __lasx_xvsub_w(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in7); \
+ _out1 = __lasx_xvadd_d(_in1, _in6); \
+ _out2 = __lasx_xvadd_d(_in2, _in5); \
+ _out3 = __lasx_xvadd_d(_in3, _in4); \
+ _out4 = __lasx_xvsub_d(_in3, _in4); \
+ _out5 = __lasx_xvsub_d(_in2, _in5); \
+ _out6 = __lasx_xvsub_d(_in1, _in6); \
+ _out7 = __lasx_xvsub_d(_in0, _in7); \
+ }
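
The 8-input variants follow the same mirrored add/sub pattern; a hedged
scalar generalization that covers both the 4- and 8-input cases (helper
name illustrative):

static void butterfly_n_scalar(const int16_t* in, int16_t* out, int n) {
  for (int i = 0; i < n / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[n - 1 - i]);          /* top half: sums */
    out[n - 1 - i] = (int16_t)(in[i] - in[n - 1 - i]);  /* bottom: diffs */
  }
}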
+
+#endif // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs - RTYPE, element_num, in0, enter
+ *               Outputs -
+ * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0'.
+ *               If 'enter' is TRUE, the prefix "\nVP:" is printed first.
+ * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ * VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+ { \
+ RTYPE _tmp0 = (RTYPE)in0; \
+ int _i = 0; \
+ if (enter) \
+ printf("\nVP:"); \
+ for (_i = 0; _i < element_num; _i++) \
+ printf("%d,", _tmp0[_i]); \
+ }
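
A usage sketch matching the example in the comment above, assuming an
LSX/LASX build where v4i32 is defined:

v4i32 vec = {1, 2, 3, 4};
VECT_PRINT(v4i32, 4, vec, 1); /* prints "\nVP:1,2,3,4," */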
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */
diff --git a/files/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h
index 29997ce1..b9a44fcc 100644
--- a/files/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -81,25 +81,35 @@
})
#endif // !(__mips == 64)
#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ volatile( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
})
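
The lwr/lwl pair above implements an unaligned 32-bit load on pre-R6 MIPS.
A portable sketch of the same operation, with an illustrative helper name:

#include <stdint.h>
#include <string.h>

static inline uint32_t load_u32_unaligned(const void* p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v)); /* compilers emit lwr/lwl or ulw as appropriate */
  return v;
}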
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ volatile( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
@@ -140,6 +150,9 @@
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
diff --git a/files/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h
index 275f8d4c..275f8d4c 100644
--- a/files/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
diff --git a/files/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index f6f5b3ed..f9344721 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -30,7 +30,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -83,6 +86,50 @@ void SetPlane(uint8_t* dst_y,
int height,
uint32_t value);
+// Convert a plane of tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a plane of 16 bit tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a UV plane of tiles of 16 x H into linear U and V planes.
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a Y and UV plane of tiles into interlaced YUY2.
+LIBYUV_API
+void DetileToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height,
+ int tile_height);
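
A hedged usage sketch of DetilePlane. Buffer sizes, strides and the
tiled-source stride convention are assumptions for illustration; consult the
implementation before relying on them.

/* Linearize a 640x480 Y plane stored as 16x16 tiles (values illustrative). */
int ret = DetilePlane(src_y, /*src_stride_y=*/640,
                      dst_y, /*dst_stride_y=*/640,
                      /*width=*/640, /*height=*/480, /*tile_height=*/16);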
+
// Split interleaved UV plane into separate U and V planes.
LIBYUV_API
void SplitUVPlane(const uint8_t* src_uv,
@@ -105,6 +152,72 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
+// Split interleaved msb UV plane into separate lsb U and V planes.
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate lsb U and V planes into one interleaved msb UV plane.
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int depth);
+
+// Convert lsb plane to msb plane.
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth);
+
+// Convert msb plane to lsb plane.
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth);
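
Inferred from the comments, not confirmed here: the per-pixel operation is
presumably a shift that moves the 'depth'-bit value between the low and high
bits of the 16-bit word:

uint16_t msb = (uint16_t)(lsb << (16 - depth));  /* lsb -> msb, presumed */
uint16_t lsb2 = (uint16_t)(msb >> (16 - depth)); /* msb -> lsb, presumed */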
+
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
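
A sketch of the I444-to-NV12 chroma path described above. The plane geometry
is an assumption: src_u/src_v are width x height, and dst_uv is
((width + 1) / 2) * 2 bytes per row by (height + 1) / 2 rows.

HalfMergeUVPlane(src_u, /*src_stride_u=*/width,
                 src_v, /*src_stride_v=*/width,
                 dst_uv, /*dst_stride_uv=*/((width + 1) / 2) * 2,
                 width, height);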
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8_t* src_rgb,
@@ -131,6 +244,92 @@ void MergeRGBPlane(const uint8_t* src_r,
int width,
int height);
+// Split interleaved ARGB plane into separate R, G, B and A planes.
+// dst_a can be NULL to discard alpha plane.
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// Merge separate R, G, B and A planes into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Merge separate 'depth' bit R, G and B planes stored in lsb
+// into one interleaved XR30 plane.
+// depth should be in range [10, 16]
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved AR64 plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should be in range [1, 16]
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should be in range [8, 16]
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth);
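
A hedged sketch of the presumed depth handling in the 16-to-8 merge: shifting
the low 'depth' bits down to 8 is an assumption consistent with the [8, 16]
range noted above (helper name illustrative).

static inline uint8_t depth_to_8bit(uint16_t v, int depth) {
  return (uint8_t)(v >> (depth - 8)); /* e.g. depth=10: 0..1023 -> 0..255 */
}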
+
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
@@ -178,6 +377,68 @@ int I444Copy(const uint8_t* src_y,
int width,
int height);
+// Copy I210 to I210.
+#define I210ToI210 I210Copy
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I410 to I410.
+#define I410ToI410 I410Copy
+LIBYUV_API
+int I410Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy NV12. Supports inverting.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Copy NV21. Supports inverting.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -245,6 +506,14 @@ int YUY2ToY(const uint8_t* src_yuy2,
int width,
int height);
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8_t* src_y,
@@ -293,6 +562,22 @@ int I400Mirror(const uint8_t* src_y,
int height);
// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
@@ -304,56 +589,35 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// RGB24 mirror.
LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
-// Convert I422 to ABGR.
+// Mirror a plane of data.
LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
-// Convert I422 to RGBA.
+// Mirror a plane of UV data.
LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
@@ -566,15 +830,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
int width,
int height);
-typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
@@ -734,6 +989,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
+// Gaussian 5x5 blur of a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
@@ -759,6 +1027,21 @@ int InterpolatePlane(const uint8_t* src0,
int height,
int interpolation);
+// Interpolate between two images using a specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as an 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+ int src_stride0, // measured in 16 bit pixels
+ const uint16_t* src1,
+ int src_stride1,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
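
A worked example of the 8-bit fraction, assuming the conventional rounded
blend (the exact rounding is an assumption):

static inline uint16_t interp_px(uint16_t a, uint16_t b, int f /* 0..255 */) {
  return (uint16_t)((a * (256 - f) + b * f + 128) >> 8);
}
/* f = 64 weights src0 at 75%: interp_px(100, 200, 64) == 125. */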
+
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API
@@ -815,7 +1098,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
-// shuffler is 16 bytes and must be aligned.
+// shuffler is 16 bytes.
LIBYUV_API
int ARGBShuffle(const uint8_t* src_bgra,
int src_stride_bgra,
@@ -825,6 +1108,17 @@ int ARGBShuffle(const uint8_t* src_bgra,
int width,
int height);
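
A sketch of a 16-byte shuffler that swaps the R and B channels of each ARGB
pixel; the byte-index-within-16-byte-group convention is an assumption based
on the pshufb-style shuffle described above.

static const uint8_t kShuffleSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                           10, 9, 8, 11, 14, 13, 12, 15};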
+// Shuffle AR64 channel order. e.g. AR64 to AB64.
+// shuffler is 16 bytes.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ const uint8_t* shuffler,
+ int width,
+ int height);
+
// Sobel ARGB effect with planar output.
LIBYUV_API
int ARGBSobelToPlane(const uint8_t* src_argb,
diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h
new file mode 100644
index 00000000..37460c4a
--- /dev/null
+++ b/include/libyuv/rotate.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
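
A hedged usage sketch rotating an I420 frame 90 degrees clockwise. width and
height are the source dimensions; for kRotate90/270 the destination planes
are assumed to be height x width, so the destination strides below reflect
that.

I420Rotate(src_y, src_w, src_u, (src_w + 1) / 2, src_v, (src_w + 1) / 2,
           dst_y, src_h, dst_u, (src_h + 1) / 2, dst_v, (src_h + 1) / 2,
           src_w, src_h, kRotate90);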
+
+// Rotate I422 frame.
+LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I010 frame.
+LIBYUV_API
+int I010Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I210 frame.
+LIBYUV_API
+int I210Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I410 frame.
+LIBYUV_API
+int I410Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Convert Android420 to I420 with rotation.
+// "rotation" can be 0, 90, 180 or 270.
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode rotation);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotations for when U and V are interleaved.
+// These functions take one UV input pointer and
+// split the data into two buffers while
+// rotating them.
+// width and height are expected to be half size for NV12.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+LIBYUV_API
+void SplitRotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitRotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitRotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose while reversing the read/write
+// order results in a rotation by +/- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
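
A scalar sketch of the transpose-plus-reversal relationship described above
(illustrative only, not libyuv code):

static void rotate90_scalar(const uint8_t* src, int w, int h, uint8_t* dst) {
  /* dst is h wide and w tall; clockwise 90 = transpose + reversed rows. */
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x)
      dst[x * h + (h - 1 - y)] = src[y * w + x];
}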
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/files/include/libyuv/rotate_argb.h b/include/libyuv/rotate_argb.h
index 20432949..20432949 100644
--- a/files/include/libyuv/rotate_argb.h
+++ b/include/libyuv/rotate_argb.h
diff --git a/files/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index 022293ee..3e6a2fef 100644
--- a/files/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -28,12 +28,16 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
-// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// The following are available for Visual C 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+ !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
@@ -41,6 +45,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
#endif
// The following are available for 64 bit GCC:
@@ -53,6 +59,7 @@ extern "C" {
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
+#define HAS_TRANSPOSE4X4_32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -60,9 +67,9 @@ extern "C" {
#define HAS_TRANSPOSEUVWX16_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_TRANSPOSEWX8_MMI
-#define HAS_TRANSPOSEUVWX8_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_TRANSPOSEWX16_LSX
+#define HAS_TRANSPOSEUVWX16_LSX
#endif
void TransposeWxH_C(const uint8_t* src,
@@ -92,11 +99,6 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width);
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -107,6 +109,11 @@ void TransposeWx16_MSA(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Any_NEON(const uint8_t* src,
int src_stride,
@@ -118,11 +125,6 @@ void TransposeWx8_Any_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Any_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -133,6 +135,11 @@ void TransposeWx16_Any_MSA(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx16_Any_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeUVWxH_C(const uint8_t* src,
int src_stride,
@@ -171,13 +178,6 @@ void TransposeUVWx8_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width);
void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -185,6 +185,13 @@ void TransposeUVWx16_MSA(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx8_Any_SSE2(const uint8_t* src,
int src_stride,
@@ -200,13 +207,6 @@ void TransposeUVWx8_Any_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_Any_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width);
void TransposeUVWx16_Any_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -214,6 +214,55 @@ void TransposeUVWx16_Any_MSA(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx16_Any_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeWxH_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+void TransposeWx8_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx1_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_AVX2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/row.h b/include/libyuv/row.h
index 9bb48850..46685a50 100644
--- a/files/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -11,7 +11,8 @@
#ifndef INCLUDE_LIBYUV_ROW_H_
#define INCLUDE_LIBYUV_ROW_H_
-#include <stdlib.h> // For malloc.
+#include <stddef.h> // For NULL
+#include <stdlib.h> // For malloc
#include "libyuv/basic_types.h"
@@ -30,7 +31,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -74,7 +78,6 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
// Conversions:
-#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
@@ -87,18 +90,13 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTOUV444ROW_SSSE3
-#define HAS_ARGBTOUVJROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_HALFFLOATROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@@ -108,11 +106,13 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
+#define HAS_I444TORGB24ROW_SSSE3
+#define HAS_INTERPOLATEROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -120,11 +120,12 @@ extern "C" {
#define HAS_NV21TORGB24ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
-#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_ERMS
#define HAS_SETROW_X86
@@ -137,11 +138,18 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
+#endif
// Effects:
#define HAS_ARGBADDROW_SSE2
#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
@@ -156,11 +164,9 @@ extern "C" {
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_BLENDPLANEROW_SSSE3
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
#define HAS_RGBCOLORTABLEROW_X86
#define HAS_SOBELROW_SSE2
#define HAS_SOBELTOPLANEROW_SSE2
@@ -175,6 +181,7 @@ extern "C" {
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I444ALPHATOARGBROW_SSSE3
#endif
#endif
@@ -190,15 +197,11 @@ extern "C" {
#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
-// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
-#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -206,6 +209,7 @@ extern "C" {
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_I444TOARGBROW_AVX2
+#define HAS_I444TORGB24ROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
@@ -215,6 +219,8 @@ extern "C" {
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB24ROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@@ -224,13 +230,16 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test half float cast
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#endif
// Effects:
#define HAS_ARGBADDROW_AVX2
-#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
-#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_BLENDPLANEROW_AVX2
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
@@ -238,13 +247,14 @@ extern "C" {
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
#define HAS_I422ALPHATOARGBROW_AVX2
+#define HAS_I444ALPHATOARGBROW_AVX2
#endif
#endif
-// The following are available for AVX2 Visual C and clangcl 32 bit:
+// The following are available for AVX2 Visual C 32 bit:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
- (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+ !defined(__clang__) && defined(VISUALC_HAS_AVX2)
#define HAS_ARGB1555TOARGBROW_AVX2
#define HAS_ARGB4444TOARGBROW_AVX2
#define HAS_ARGBTOARGB1555ROW_AVX2
@@ -257,62 +267,162 @@ extern "C" {
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I444ALPHATOARGBROW_SSSE3
+#define HAS_I444TOARGBROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_AB64TOARGBROW_SSSE3
#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ABGRTOYJROW_SSSE3
+#define HAS_AR64TOARGBROW_SSSE3
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBTOAB64ROW_SSSE3
#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR64ROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
+#define HAS_DETILEROW_16_SSE2
+#define HAS_DETILEROW_SSE2
+#define HAS_DETILESPLITUVROW_SSSE3
+#define HAS_DETILETOYUY2_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I212TOAR30ROW_SSSE3
+#define HAS_I212TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I410TOAR30ROW_SSSE3
+#define HAS_I410TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MERGEXRGBROW_SSE2
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV21TOYUV24ROW_SSSE3
+#define HAS_P210TOAR30ROW_SSSE3
+#define HAS_P210TOARGBROW_SSSE3
+#define HAS_P410TOAR30ROW_SSSE3
+#define HAS_P410TOARGBROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
+#define HAS_SPLITARGBROW_SSE2
+#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SPLITXRGBROW_SSE2
+#define HAS_SPLITXRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
+#define HAS_YUY2TONVUVROW_SSE2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVJROW_SSSE3
+#endif
+
+#if defined(__x86_64__) || !defined(__pic__)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I210ALPHATOARGBROW_SSSE3
+#define HAS_I410ALPHATOARGBROW_SSSE3
+#endif
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_AB64TOARGBROW_AVX2
#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOYJROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
+#define HAS_AR64TOARGBROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBTOAB64ROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTOAR64ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_DETILEROW_16_AVX
+#define HAS_DIVIDEROW_16_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I212TOAR30ROW_AVX2
+#define HAS_I212TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I410TOAR30ROW_AVX2
+#define HAS_I410TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_INTERPOLATEROW_16TO8_AVX2
+#define HAS_MERGEAR64ROW_AVX2
+#define HAS_MERGEARGB16TO8ROW_AVX2
+#define HAS_MERGEARGBROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MERGEXR30ROW_AVX2
+#define HAS_MERGEXR64ROW_AVX2
+#define HAS_MERGEXRGB16TO8ROW_AVX2
+#define HAS_MERGEXRGBROW_AVX2
+#define HAS_MIRRORUVROW_AVX2
#define HAS_MULTIPLYROW_16_AVX2
-// TODO(fbarchard): Fix AVX2 version of YUV24
-// #define HAS_NV21TOYUV24ROW_AVX2
+#define HAS_NV21TOYUV24ROW_AVX2
+#define HAS_P210TOAR30ROW_AVX2
+#define HAS_P210TOARGBROW_AVX2
+#define HAS_P410TOAR30ROW_AVX2
+#define HAS_P410TOARGBROW_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SPLITARGBROW_AVX2
+#define HAS_SPLITUVROW_16_AVX2
+#define HAS_SPLITXRGBROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+#define HAS_YUY2TONVUVROW_AVX2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVJROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#endif
+
+#if defined(__x86_64__) || !defined(__pic__)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I210ALPHATOARGBROW_AVX2
+#define HAS_I410ALPHATOARGBROW_AVX2
+#endif
#endif
// The following are available for AVX512 clang x86 platforms:
// TODO(fbarchard): Port to GCC and Visual C
// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
- (defined(CLANG_HAS_AVX512))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_MERGEUVROW_AVX512BW
+#endif
+
+// The following are available for AVX512 clang x64 platforms:
+// TODO(fbarchard): Port to x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
+ (defined(CLANG_HAS_AVX512))
+#define HAS_I422TOARGBROW_AVX512BW
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_AB64TOARGBROW_NEON
+#define HAS_ABGRTOUVJROW_NEON
#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYJROW_NEON
#define HAS_ABGRTOYROW_NEON
+#define HAS_AR64TOARGBROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON
#define HAS_ARGB1555TOUVROW_NEON
#define HAS_ARGB1555TOYROW_NEON
@@ -321,6 +431,8 @@ extern "C" {
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBEXTRACTALPHAROW_NEON
#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOAB64ROW_NEON
+#define HAS_ARGBTOAR64ROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTORAWROW_NEON
@@ -338,8 +450,16 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
+#define HAS_CONVERT16TO8ROW_NEON
#define HAS_COPYROW_NEON
+#define HAS_DETILEROW_16_NEON
+#define HAS_DETILEROW_NEON
+#define HAS_DETILESPLITUVROW_NEON
+#define HAS_DETILETOYUY2_NEON
+#define HAS_UNPACKMT2T_NEON
+#define HAS_DIVIDEROW_16_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -350,11 +470,25 @@ extern "C" {
#define HAS_I422TORGBAROW_NEON
#define HAS_I422TOUYVYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444ALPHATOARGBROW_NEON
#define HAS_I444TOARGBROW_NEON
+#define HAS_I444TORGB24ROW_NEON
+#define HAS_INTERPOLATEROW_16_NEON
+#define HAS_INTERPOLATEROW_NEON
#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEAR64ROW_NEON
+#define HAS_MERGEARGB16TO8ROW_NEON
+#define HAS_MERGEARGBROW_NEON
+#define HAS_MERGEUVROW_16_NEON
#define HAS_MERGEUVROW_NEON
+#define HAS_MERGEXR30ROW_NEON
+#define HAS_MERGEXR64ROW_NEON
+#define HAS_MERGEXRGB16TO8ROW_NEON
+#define HAS_MERGEXRGBROW_NEON
#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_MIRRORUVROW_NEON
+#define HAS_MULTIPLYROW_16_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
@@ -363,25 +497,35 @@ extern "C" {
#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
+#define HAS_RAWTOUVJROW_NEON
#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVJROW_NEON
#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON
+#define HAS_SPLITUVROW_16_NEON
#define HAS_SPLITUVROW_NEON
-#define HAS_UVToVUROW_NEON
+#define HAS_SPLITXRGBROW_NEON
+#define HAS_SWAPUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TONVUVROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
@@ -399,7 +543,7 @@ extern "C" {
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
-#define HAS_INTERPOLATEROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
#define HAS_SOBELXROW_NEON
@@ -409,10 +553,13 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#define HAS_FLOATDIVTOBYTEROW_NEON
+#define HAS_GAUSSCOL_F32_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_INTERPOLATEROW_16TO8_NEON
#define HAS_SCALESUMSAMPLES_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVJROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_ABGRTOYROW_MSA
#define HAS_ARGB1555TOARGBROW_MSA
@@ -449,8 +596,11 @@ extern "C" {
#define HAS_HALFFLOATROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TOARGB4444ROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
#define HAS_I422TORGBAROW_MSA
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
@@ -459,6 +609,7 @@ extern "C" {
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
#define HAS_MIRRORUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
@@ -491,79 +642,217 @@ extern "C" {
#define HAS_YUY2TOYROW_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_ABGRTOUVROW_MMI
-#define HAS_ABGRTOYROW_MMI
-#define HAS_ARGB1555TOARGBROW_MMI
-#define HAS_ARGB1555TOUVROW_MMI
-#define HAS_ARGB1555TOYROW_MMI
-#define HAS_ARGB4444TOARGBROW_MMI
-#define HAS_ARGB4444TOUVROW_MMI
-#define HAS_ARGB4444TOYROW_MMI
-#define HAS_ARGBADDROW_MMI
-#define HAS_ARGBATTENUATEROW_MMI
-#define HAS_ARGBBLENDROW_MMI
-#define HAS_ARGBCOLORMATRIXROW_MMI
-#define HAS_ARGBCOPYALPHAROW_MMI
-#define HAS_ARGBCOPYYTOALPHAROW_MMI
-#define HAS_ARGBEXTRACTALPHAROW_MMI
-#define HAS_ARGBGRAYROW_MMI
-#define HAS_ARGBMIRRORROW_MMI
-#define HAS_ARGBMULTIPLYROW_MMI
-#define HAS_ARGBSEPIAROW_MMI
-#define HAS_ARGBSHADEROW_MMI
-#define HAS_ARGBSHUFFLEROW_MMI
-#define HAS_ARGBSUBTRACTROW_MMI
-#define HAS_ARGBTOARGB1555ROW_MMI
-#define HAS_ARGBTOARGB4444ROW_MMI
-#define HAS_ARGBTORAWROW_MMI
-#define HAS_ARGBTORGB24ROW_MMI
-#define HAS_ARGBTORGB565DITHERROW_MMI
-#define HAS_ARGBTORGB565ROW_MMI
-#define HAS_ARGBTOUV444ROW_MMI
-#define HAS_ARGBTOUVJROW_MMI
-#define HAS_ARGBTOUVROW_MMI
-#define HAS_ARGBTOYJROW_MMI
-#define HAS_ARGBTOYROW_MMI
-#define HAS_BGRATOUVROW_MMI
-#define HAS_BGRATOYROW_MMI
-#define HAS_BLENDPLANEROW_MMI
-#define HAS_COMPUTECUMULATIVESUMROW_MMI
-#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
-#define HAS_HALFFLOATROW_MMI
-#define HAS_I400TOARGBROW_MMI
-#define HAS_I422TOUYVYROW_MMI
-#define HAS_I422TOYUY2ROW_MMI
-#define HAS_INTERPOLATEROW_MMI
-#define HAS_J400TOARGBROW_MMI
-#define HAS_MERGERGBROW_MMI
-#define HAS_MERGEUVROW_MMI
-#define HAS_MIRRORROW_MMI
-#define HAS_MIRRORUVROW_MMI
-#define HAS_RAWTOARGBROW_MMI
-#define HAS_RAWTORGB24ROW_MMI
-#define HAS_RAWTOUVROW_MMI
-#define HAS_RAWTOYROW_MMI
-#define HAS_RGB24TOARGBROW_MMI
-#define HAS_RGB24TOUVROW_MMI
-#define HAS_RGB24TOYROW_MMI
-#define HAS_RGB565TOARGBROW_MMI
-#define HAS_RGB565TOUVROW_MMI
-#define HAS_RGB565TOYROW_MMI
-#define HAS_RGBATOUVROW_MMI
-#define HAS_RGBATOYROW_MMI
-#define HAS_SOBELROW_MMI
-#define HAS_SOBELTOPLANEROW_MMI
-#define HAS_SOBELXROW_MMI
-#define HAS_SOBELXYROW_MMI
-#define HAS_SOBELYROW_MMI
-#define HAS_SPLITRGBROW_MMI
-#define HAS_SPLITUVROW_MMI
-#define HAS_UYVYTOUVROW_MMI
-#define HAS_UYVYTOYROW_MMI
-#define HAS_YUY2TOUV422ROW_MMI
-#define HAS_YUY2TOUVROW_MMI
-#define HAS_YUY2TOYROW_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_ABGRTOUVROW_LSX
+#define HAS_ABGRTOYROW_LSX
+#define HAS_ARGB1555TOARGBROW_LSX
+#define HAS_ARGB1555TOUVROW_LSX
+#define HAS_ARGB1555TOYROW_LSX
+#define HAS_ARGB4444TOARGBROW_LSX
+#define HAS_ARGBADDROW_LSX
+#define HAS_ARGBATTENUATEROW_LSX
+#define HAS_ARGBBLENDROW_LSX
+#define HAS_ARGBCOLORMATRIXROW_LSX
+#define HAS_ARGBEXTRACTALPHAROW_LSX
+#define HAS_ARGBGRAYROW_LSX
+#define HAS_ARGBSEPIAROW_LSX
+#define HAS_ARGBSHADEROW_LSX
+#define HAS_ARGBSHUFFLEROW_LSX
+#define HAS_ARGBSUBTRACTROW_LSX
+#define HAS_ARGBQUANTIZEROW_LSX
+#define HAS_ARGBSETROW_LSX
+#define HAS_ARGBTOARGB1555ROW_LSX
+#define HAS_ARGBTOARGB4444ROW_LSX
+#define HAS_ARGBTORAWROW_LSX
+#define HAS_ARGBTORGB24ROW_LSX
+#define HAS_ARGBTORGB565ROW_LSX
+#define HAS_ARGBTORGB565DITHERROW_LSX
+#define HAS_ARGBTOUVJROW_LSX
+#define HAS_ARGBTOUV444ROW_LSX
+#define HAS_ARGBTOUVROW_LSX
+#define HAS_ARGBTOYJROW_LSX
+#define HAS_ARGBMIRRORROW_LSX
+#define HAS_ARGBMULTIPLYROW_LSX
+#define HAS_BGRATOUVROW_LSX
+#define HAS_BGRATOYROW_LSX
+#define HAS_I400TOARGBROW_LSX
+#define HAS_I444TOARGBROW_LSX
+#define HAS_INTERPOLATEROW_LSX
+#define HAS_I422ALPHATOARGBROW_LSX
+#define HAS_I422TOARGB1555ROW_LSX
+#define HAS_I422TOARGB4444ROW_LSX
+#define HAS_I422TORGB24ROW_LSX
+#define HAS_I422TORGB565ROW_LSX
+#define HAS_I422TORGBAROW_LSX
+#define HAS_I422TOUYVYROW_LSX
+#define HAS_I422TOYUY2ROW_LSX
+#define HAS_J400TOARGBROW_LSX
+#define HAS_MERGEUVROW_LSX
+#define HAS_MIRRORROW_LSX
+#define HAS_MIRRORUVROW_LSX
+#define HAS_MIRRORSPLITUVROW_LSX
+#define HAS_NV12TOARGBROW_LSX
+#define HAS_NV12TORGB565ROW_LSX
+#define HAS_NV21TOARGBROW_LSX
+#define HAS_RAWTOARGBROW_LSX
+#define HAS_RAWTORGB24ROW_LSX
+#define HAS_RAWTOUVROW_LSX
+#define HAS_RAWTOYROW_LSX
+#define HAS_RGB24TOARGBROW_LSX
+#define HAS_RGB24TOUVROW_LSX
+#define HAS_RGB24TOYROW_LSX
+#define HAS_RGB565TOARGBROW_LSX
+#define HAS_RGB565TOUVROW_LSX
+#define HAS_RGB565TOYROW_LSX
+#define HAS_RGBATOUVROW_LSX
+#define HAS_RGBATOYROW_LSX
+#define HAS_SETROW_LSX
+#define HAS_SOBELROW_LSX
+#define HAS_SOBELTOPLANEROW_LSX
+#define HAS_SOBELXYROW_LSX
+#define HAS_SPLITUVROW_LSX
+#define HAS_UYVYTOARGBROW_LSX
+#define HAS_UYVYTOUV422ROW_LSX
+#define HAS_UYVYTOUVROW_LSX
+#define HAS_UYVYTOYROW_LSX
+#define HAS_YUY2TOARGBROW_LSX
+#define HAS_YUY2TOUVROW_LSX
+#define HAS_YUY2TOUV422ROW_LSX
+#define HAS_YUY2TOYROW_LSX
+#define HAS_ARGBTOYROW_LSX
+#define HAS_ABGRTOYJROW_LSX
+#define HAS_RGBATOYJROW_LSX
+#define HAS_RGB24TOYJROW_LSX
+#define HAS_RAWTOYJROW_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_I422TOARGBROW_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#define HAS_ARGB1555TOARGBROW_LASX
+#define HAS_ARGB1555TOUVROW_LASX
+#define HAS_ARGB1555TOYROW_LASX
+#define HAS_ARGB4444TOARGBROW_LASX
+#define HAS_ARGBADDROW_LASX
+#define HAS_ARGBATTENUATEROW_LASX
+#define HAS_ARGBGRAYROW_LASX
+#define HAS_ARGBMIRRORROW_LASX
+#define HAS_ARGBMULTIPLYROW_LASX
+#define HAS_ARGBSEPIAROW_LASX
+#define HAS_ARGBSHADEROW_LASX
+#define HAS_ARGBSHUFFLEROW_LASX
+#define HAS_ARGBSUBTRACTROW_LASX
+#define HAS_ARGBTOARGB1555ROW_LASX
+#define HAS_ARGBTOARGB4444ROW_LASX
+#define HAS_ARGBTORAWROW_LASX
+#define HAS_ARGBTORGB24ROW_LASX
+#define HAS_ARGBTORGB565DITHERROW_LASX
+#define HAS_ARGBTORGB565ROW_LASX
+#define HAS_ARGBTOUV444ROW_LASX
+#define HAS_ARGBTOUVJROW_LASX
+#define HAS_ARGBTOUVROW_LASX
+#define HAS_ARGBTOYJROW_LASX
+#define HAS_ARGBTOYROW_LASX
+#define HAS_ABGRTOYJROW_LASX
+#define HAS_ABGRTOYROW_LASX
+#define HAS_I422ALPHATOARGBROW_LASX
+#define HAS_I422TOARGB1555ROW_LASX
+#define HAS_I422TOARGB4444ROW_LASX
+#define HAS_I422TOARGBROW_LASX
+#define HAS_I422TORGB24ROW_LASX
+#define HAS_I422TORGB565ROW_LASX
+#define HAS_I422TORGBAROW_LASX
+#define HAS_I422TOUYVYROW_LASX
+#define HAS_I422TOYUY2ROW_LASX
+#define HAS_MIRRORROW_LASX
+#define HAS_MIRRORUVROW_LASX
+#define HAS_NV12TOARGBROW_LASX
+#define HAS_NV12TORGB565ROW_LASX
+#define HAS_NV21TOARGBROW_LASX
+#define HAS_RAWTOARGBROW_LASX
+#define HAS_RAWTOUVROW_LASX
+#define HAS_RAWTOYROW_LASX
+#define HAS_RGB24TOARGBROW_LASX
+#define HAS_RGB24TOUVROW_LASX
+#define HAS_RGB24TOYROW_LASX
+#define HAS_RGB565TOARGBROW_LASX
+#define HAS_RGB565TOUVROW_LASX
+#define HAS_RGB565TOYROW_LASX
+#define HAS_UYVYTOUV422ROW_LASX
+#define HAS_UYVYTOUVROW_LASX
+#define HAS_UYVYTOYROW_LASX
+#define HAS_YUY2TOUV422ROW_LASX
+#define HAS_YUY2TOUVROW_LASX
+#define HAS_YUY2TOYROW_LASX
+#define HAS_RGBATOYROW_LASX
+#define HAS_RGBATOYJROW_LASX
+#define HAS_BGRATOYROW_LASX
+#define HAS_RGB24TOYJROW_LASX
+#define HAS_RAWTOYJROW_LASX
+#endif
+
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#define HAS_COPYROW_RVV
+#if __riscv_v_intrinsic == 11000
+#define HAS_AB64TOARGBROW_RVV
+#define HAS_ABGRTOYJROW_RVV
+#define HAS_ABGRTOYROW_RVV
+#define HAS_AR64TOARGBROW_RVV
+#define HAS_AR64TOAB64ROW_RVV
+#define HAS_ARGBATTENUATEROW_RVV
+#define HAS_ARGBBLENDROW_RVV
+#define HAS_ARGBCOPYYTOALPHAROW_RVV
+#define HAS_ARGBEXTRACTALPHAROW_RVV
+#define HAS_ARGBTOAB64ROW_RVV
+#define HAS_ARGBTOABGRROW_RVV
+#define HAS_ARGBTOAR64ROW_RVV
+#define HAS_ARGBTOBGRAROW_RVV
+#define HAS_ARGBTORAWROW_RVV
+#define HAS_ARGBTORGB24ROW_RVV
+#define HAS_ARGBTORGBAROW_RVV
+#define HAS_ARGBTOYJROW_RVV
+#define HAS_ARGBTOYMATRIXROW_RVV
+#define HAS_ARGBTOYROW_RVV
+#define HAS_BGRATOYROW_RVV
+#define HAS_BLENDPLANEROW_RVV
+#define HAS_I400TOARGBROW_RVV
+#define HAS_I422ALPHATOARGBROW_RVV
+#define HAS_I422TOARGBROW_RVV
+#define HAS_I422TORGB24ROW_RVV
+#define HAS_I422TORGBAROW_RVV
+#define HAS_I444ALPHATOARGBROW_RVV
+#define HAS_I444TOARGBROW_RVV
+#define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
+#define HAS_J400TOARGBROW_RVV
+#define HAS_MERGEARGBROW_RVV
+#define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
+#define HAS_MERGEXRGBROW_RVV
+#define HAS_NV12TOARGBROW_RVV
+#define HAS_NV12TORGB24ROW_RVV
+#define HAS_NV21TOARGBROW_RVV
+#define HAS_NV21TORGB24ROW_RVV
+#define HAS_RAWTOARGBROW_RVV
+#define HAS_RAWTORGB24ROW_RVV
+#define HAS_RAWTORGBAROW_RVV
+#define HAS_RAWTOYJROW_RVV
+#define HAS_RAWTOYROW_RVV
+#define HAS_RGB24TOARGBROW_RVV
+#define HAS_RGB24TOYJROW_RVV
+#define HAS_RGB24TOYROW_RVV
+#define HAS_RGBATOARGBROW_RVV
+#define HAS_RGBATOYJROW_RVV
+#define HAS_RGBATOYMATRIXROW_RVV
+#define HAS_RGBATOYROW_RVV
+#define HAS_RGBTOYMATRIXROW_RVV
+#define HAS_SPLITARGBROW_RVV
+#define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
+#define HAS_SPLITXRGBROW_RVV
+#endif
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -572,8 +861,10 @@ extern "C" {
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
+#define LIBYUV_NOINLINE __declspec(noinline)
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -591,8 +882,10 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
+#define LIBYUV_NOINLINE __attribute__((noinline))
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -605,8 +898,10 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32;
typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
+#define LIBYUV_NOINLINE
typedef int16_t vec16[8];
typedef int32_t vec32[4];
+typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
@@ -619,65 +914,40 @@ typedef uint32_t ulvec32[8];
typedef uint8_t ulvec8[32];
#endif
-#if defined(__aarch64__)
-// This struct is for Arm64 color conversion.
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+// This struct is for ARM and RISC-V color conversion.
struct YuvConstants {
- uvec16 kUVToRB;
- uvec16 kUVToRB2;
- uvec16 kUVToG;
- uvec16 kUVToG2;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
-};
-#elif defined(__arm__)
-// This struct is for ArmV7 color conversion.
-struct YuvConstants {
- uvec8 kUVToRB;
- uvec8 kUVToG;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
+ uvec8 kUVCoeff;
+ vec16 kRGBCoeffBias;
};
#else
// This struct is for Intel color conversion.
struct YuvConstants {
- int8_t kUVToB[32];
- int8_t kUVToG[32];
- int8_t kUVToR[32];
- int16_t kUVBiasB[16];
- int16_t kUVBiasG[16];
- int16_t kUVBiasR[16];
+ uint8_t kUVToB[32];
+ uint8_t kUVToG[32];
+ uint8_t kUVToR[32];
int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
};
// Offsets into YuvConstants structure
#define KUVTOB 0
#define KUVTOG 32
#define KUVTOR 64
-#define KUVBIASB 96
-#define KUVBIASG 128
-#define KUVBIASR 160
-#define KYTORGB 192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
+#define KYTORGB 96
+#define KYBIASTORGB 128
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
+#endif
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
-#define align_buffer_64(var, size) \
- uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
- uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+#define align_buffer_64(var, size) \
+ void* var##_mem = malloc((size) + 63); /* NOLINT */ \
+ uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
#define free_aligned_buffer_64(var) \
free(var##_mem); \
- var = 0
+ var = NULL
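
Usage sketch of the alignment macros above:

align_buffer_64(buf, 1024);  /* buf_mem holds the raw malloc; buf is the
                                64-byte aligned view of 1024 usable bytes */
/* ... use buf ... */
free_aligned_buffer_64(buf); /* frees buf_mem and sets buf to NULL */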
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
@@ -749,12 +1019,25 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -762,12 +1045,6 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToRGBARow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -835,12 +1112,62 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
@@ -848,12 +1175,36 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGBARow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -861,30 +1212,92 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB24Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
@@ -909,23 +1322,107 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width);
+void NV12ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -939,25 +1436,39 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUV444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVRow_MMI(const uint8_t* src_argb0,
+void ARGBToUVRow_LSX(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width);
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
@@ -983,6 +1494,16 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -998,32 +1519,37 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVJRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1038,56 +1564,78 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void BGRAToUVRow_LSX(const uint8_t* src_bgra,
+ int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void ABGRToUVRow_LSX(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void RGBAToUVRow_LSX(const uint8_t* src_rgba,
+ int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void ARGBToUVJRow_LSX(const uint8_t* src_argb,
+ int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
- int src_stride_rgb565,
+void ARGBToUVJRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
int src_stride_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_LSX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_LASX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@@ -1095,46 +1643,82 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width);
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
-void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
-
-void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+
+void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width);
+void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+
+void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
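/* Editor's note: the Y vs YJ suffix on these rows distinguishes studio-range
   BT.601 luma (16..235) from full-range JPEG luma (0..255). A per-pixel
   reference using the common 8-bit fixed-point approximations; the exact
   constants are an assumption, not copied from this header. */
#include <stdint.h>
static uint8_t RGBToYRef(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8); /* 16..235 */
}
static uint8_t RGBToYJRef(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);    /* 0..255 */
}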
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1153,88 +1737,136 @@ void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
-void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+
+void BGRAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1243,7 +1875,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1256,57 +1888,81 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1350,96 +2006,131 @@ void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ABGRToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+void BGRAToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+void RGBAToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVJRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVJRow_Any_LASX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB1555ToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGB1555ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ABGRToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_C(const uint8_t* src_rgb0,
+void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_C(const uint8_t* src_rgb0,
+void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+void RGBAToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_C(const uint8_t* src_rgb0,
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1474,41 +2165,59 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
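/* Editor's note: this hunk splits two jobs that used to share one name.
   MirrorUVRow_* now mirrors an interleaved UV row (pair order reversed, U
   still first within each pair), while MirrorSplitUVRow_* keeps the old
   mirror-and-deinterleave behaviour. A plausible reference for the new
   variant; the Ref name is hypothetical. */
#include <stdint.h>
static void MirrorUVRowRef(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  src_uv += (width - 1) * 2; /* start at the last UV pair */
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_uv[0];
    dst_uv[1] = src_uv[1];
    dst_uv += 2;
    src_uv -= 2;
  }
}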
+
+void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width);
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
-void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1520,7 +2229,24 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width);
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width);
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
@@ -1542,7 +2268,11 @@ void SplitUVRow_MSA(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void SplitUVRow_MMI(const uint8_t* src_uv,
+void SplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1562,11 +2292,123 @@ void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+void SplitUVRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
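/* Editor's note: the 8-bit SplitUVRow/MergeUVRow pair is a plain NV12-style
   deinterleave/interleave; a scalar reference (hypothetical name). */
#include <stdint.h>
static void SplitUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0]; /* U then V in the interleaved plane */
    dst_v[x] = src_uv[1];
    src_uv += 2;
  }
}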
-
+void DetileRow_C(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_AVX(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_AVX(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_16_C(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileSplitUVRow_C(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileToYUY2_C(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_Any_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_Any_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
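/* Editor's note: the Detile* rows linearize tiled NV12-style layouts such as
   MM21. A plausible scalar reference assuming 16-byte-wide tiles and a width
   that is a multiple of 16; both the name and the tile width are assumptions. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void DetileRowRef(const uint8_t* src, ptrdiff_t src_tile_stride,
                         uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; x += 16) {
    memcpy(dst, src, 16);   /* one row of one tile */
    dst += 16;
    src += src_tile_stride; /* same row of the next tile */
  }
}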
+void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size);
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -1579,6 +2421,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -1587,7 +2433,11 @@ void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
-void MergeUVRow_MMI(const uint8_t* src_u,
+void MergeUVRow_LSX(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
@@ -1599,6 +2449,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void MergeUVRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -1607,11 +2461,39 @@ void MergeUVRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+void MergeUVRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
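/* Editor's note: HalfMergeUVRow appears to fuse a 2x2 box downsample with UV
   interleaving (full-resolution planar U and V in, half-resolution NV12-style
   UV out). A sketch under that assumption for even widths; the rounding and
   the Ref name are assumptions. */
#include <stdint.h>
static void HalfMergeUVRowRef(const uint8_t* src_u, int src_stride_u,
                              const uint8_t* src_v, int src_stride_v,
                              uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
                           src_u[src_stride_u + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
                           src_v[src_stride_v + 1] + 2) >> 2);
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}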
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -1627,7 +2509,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
-void SplitRGBRow_MMI(const uint8_t* src_rgb,
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
@@ -1642,11 +2524,6 @@ void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
-void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width);
void MergeRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
@@ -1663,7 +2540,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
-void MergeRGBRow_MMI(const uint8_t* src_r,
+void MergeRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
@@ -1678,31 +2555,471 @@ void MergeRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
-void MergeRGBRow_Any_MMI(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
+void MergeARGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SplitARGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void MergeXRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeXRGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SplitXRGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+
+void MergeXR30Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeAR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
int width);
+void MergeXR30Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeAR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width);
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR30Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeAR64Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR64Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR30Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeAR64Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR64Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale, /* 64 for 10 bit */
+ int depth,
int width);
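
A note on the scale -> depth rename above: callers previously passed a precomputed multiplier (the old /* 64 for 10 bit */ comment, i.e. 1 << (16 - 10)); the new parameter is the source bit depth itself. A minimal C sketch of the assumed reference behavior; MergeUVRow_16_Sketch is a hypothetical name, not part of this patch:

#include <stdint.h>

/* Hedged sketch: interleave U and V, shifting samples up to the top of
 * 16 bits. With depth 10 the shift is 6, matching the old scale of 64. */
static void MergeUVRow_16_Sketch(const uint16_t* src_u,
                                 const uint16_t* src_v,
                                 uint16_t* dst_uv,
                                 int depth,
                                 int width) {
  int shift = 16 - depth;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = (uint16_t)(src_u[x] << shift);  /* U in the low lane */
    dst_uv[1] = (uint16_t)(src_v[x] << shift);  /* V interleaved after U */
    dst_uv += 2;
  }
}
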
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
+ int width);
+void MergeUVRow_16_Any_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width);
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
int width);
+void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width);
+
+void SplitUVRow_16_C(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_Any_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
-void MultiplyRow_16_AVX2(const uint16_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width);
void MultiplyRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+
+void DivideRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
void Convert8To16Row_C(const uint8_t* src_y,
uint16_t* dst_y,
@@ -1745,12 +3062,21 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int width);
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1761,16 +3087,12 @@ void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
@@ -1785,7 +3107,10 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
-void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
@@ -1800,31 +3125,30 @@ void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
void SetRow_C(uint8_t* dst, uint8_t v8, int width);
void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_LSX(uint8_t* dst, uint8_t v8, int width);
void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_LSX(uint8_t* dst_ptr, uint8_t v32, int width);
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
@@ -1832,6 +3156,8 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_LSX(uint8_t* dst_ptr, uint32_t v32, int width);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8_t* src_argb,
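
The shuffler argument is a 4-byte index map: output byte i of each pixel is taken from input byte shuffler[i]. A hedged sketch of that contract (ARGBShuffleRow_Sketch is a hypothetical name; a map of {3, 2, 1, 0} would reverse the byte order of every 4-byte pixel):

#include <stdint.h>

/* Hedged sketch: per-pixel byte permutation driven by a 4-entry map. */
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}
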
@@ -1854,10 +3180,14 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+void ARGBShuffleRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
@@ -1874,15 +3204,20 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
-void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
+void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -1901,42 +3236,61 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
-void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
-void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
-void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
+void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
-void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
+void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
-void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
+void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
@@ -1956,6 +3310,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1985,45 +3342,59 @@ void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void RGB24ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB24ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void RGB565ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB565ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+void ARGB1555ToARGBRow_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB4444ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
@@ -2040,15 +3411,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2076,7 +3447,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2089,23 +3460,44 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
int width);
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width);
-void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
-void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width);
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width);
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width);
+
+void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width);
void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2115,11 +3507,85 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+ uint8_t* dst_ar64,
+ const uint8_t* shuffler,
+ int width);
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width);
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width);
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width);
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width);
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2131,7 +3597,7 @@ void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -2139,6 +3605,12 @@ void I444ToARGBRow_C(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2163,6 +3635,51 @@ void I210ToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2207,6 +3724,27 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void I422ToRGBARow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2243,6 +3781,12 @@ void I422ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2261,18 +3805,18 @@ void I444ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+void I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToRGB24Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
- uint8_t* dst_argb,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2298,6 +3842,44 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToAR30Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2316,6 +3898,58 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2365,6 +3999,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
@@ -2400,6 +4038,48 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+
+void P210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2460,6 +4140,12 @@ void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2472,12 +4158,24 @@ void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2502,6 +4200,44 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2520,6 +4256,58 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2574,9 +4362,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
+void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
@@ -2604,6 +4396,46 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2659,30 +4491,61 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_LSX(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
-void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
// ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2690,11 +4553,15 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+void ARGBBlendRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_RVV(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
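
Preattenuated means the foreground channels are already multiplied by alpha, so the blend reduces to out = fg + bg * (256 - fg_alpha) / 256 per channel. A hedged sketch under that assumption (hypothetical name; the shipped rows also fast-path alpha 0 and 255):

#include <stdint.h>

/* Hedged sketch of premultiplied-alpha "over" compositing. */
static void ARGBBlendRow_Sketch(const uint8_t* src_argb,
                                const uint8_t* src_argb1,
                                uint8_t* dst_argb,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)(src_argb[0] + (((256 - a) * src_argb1[0]) >> 8));
    dst_argb[1] = (uint8_t)(src_argb[1] + (((256 - a) * src_argb1[1]) >> 8));
    dst_argb[2] = (uint8_t)(src_argb[2] + (((256 - a) * src_argb1[2]) >> 8));
    dst_argb[3] = 255;  /* output is treated as opaque */
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
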
@@ -2720,16 +4587,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void BlendPlaneRow_MMI(const uint8_t* src0,
+void BlendPlaneRow_RVV(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
uint8_t* dst,
int width);
-void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ptr,
- int width);
void BlendPlaneRow_C(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
@@ -2738,11 +4600,11 @@ void BlendPlaneRow_C(const uint8_t* src0,
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
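
Multiply treats each 8-bit channel as a fraction of 255 and scales one image by the other. One plausible reference formulation, hedged (hypothetical name; the real rows use an equivalent fixed-point shortcut rather than a division):

#include <stdint.h>

/* Hedged sketch: per-channel modulate, dst = a * b / 255, rounded. */
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    dst_argb[x] = (uint8_t)((src_argb[x] * src_argb1[x] + 127) / 255);
  }
}
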
-void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2750,7 +4612,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2758,7 +4620,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2774,21 +4636,29 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB add images.
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
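
Add is a saturating per-byte sum. A hedged sketch (hypothetical name):

#include <stdint.h>

/* Hedged sketch: per-channel add, clamped to 255. */
static void ARGBAddRow_Sketch(const uint8_t* src_argb,
                              const uint8_t* src_argb1,
                              uint8_t* dst_argb,
                              int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    int v = src_argb[x] + src_argb1[x];
    dst_argb[x] = (uint8_t)(v > 255 ? 255 : v);
  }
}
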
-void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2796,7 +4666,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2804,7 +4674,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
+void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2820,22 +4690,30 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_MMI(const uint8_t* src_argb0,
+void ARGBAddRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBAddRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
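
Subtract mirrors add but clamps at zero. A hedged sketch (hypothetical name):

#include <stdint.h>

/* Hedged sketch: per-channel subtract, clamped to 0. */
static void ARGBSubtractRow_Sketch(const uint8_t* src_argb,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    int v = src_argb[x] - src_argb1[x];
    dst_argb[x] = (uint8_t)(v < 0 ? 0 : v);
  }
}
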
-void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2843,7 +4721,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2851,7 +4729,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2867,14 +4745,22 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2963,24 +4849,40 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint32_t param,
int width);
-
-void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- const uint32_t param,
- int width);
+void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
@@ -2988,12 +4890,25 @@ void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -3051,9 +4966,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
+void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
@@ -3068,24 +4983,94 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGBARow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -3093,30 +5078,92 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3141,12 +5188,55 @@ void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV12ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -3157,6 +5247,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -3167,90 +5261,130 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
@@ -3292,25 +5426,35 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
-void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_C(const uint8_t* src_uyvy,
@@ -3324,7 +5468,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy,
int width);
void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3334,7 +5478,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3344,7 +5488,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3353,53 +5497,67 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
-void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_C(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_vu,
int width);
-void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
-void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
- uint8_t* dst_uv,
+void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_uv,
int width);
-void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_vu,
int width);
@@ -3478,41 +5636,61 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
-void I422ToYUY2Row_MMI(const uint8_t* src_y,
+void I422ToYUY2Row_LSX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
void I422ToUYVYRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
-void I422ToUYVYRow_MMI(const uint8_t* src_y,
+void I422ToUYVYRow_LSX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
@@ -3528,7 +5706,13 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
@@ -3543,9 +5727,12 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32_t fixed_invtbl8[256];
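
// A hedged sketch of how such a reciprocal table can be built and applied
// for unattenuation; the exact fixed-point scale libyuv uses is an
// assumption here (16.16 reciprocal of a/255):
//   uint32_t invtbl[256];
//   invtbl[0] = 0;  // alpha 0 leaves the channel at 0
//   for (int i = 1; i < 256; ++i) invtbl[i] = (255u << 16) / i;
//   // un-premultiply channel c by alpha a: v ~= c * 255 / a
//   uint32_t v = (c * invtbl[a]) >> 16;
//   uint8_t c_full = (uint8_t)(v > 255 ? 255 : v);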
@@ -3569,13 +5756,15 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
-void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
-void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3593,7 +5782,7 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
@@ -3632,6 +5821,11 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
int interval_size,
int interval_offset,
int width);
+void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
void ARGBShadeRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3649,10 +5843,14 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
-void ARGBShadeRow_MMI(const uint8_t* src_argb,
+void ARGBShadeRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
@@ -3666,11 +5864,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width);
-void ComputeCumulativeSumRow_MMI(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width);
-
void CumulativeSumToAverageRow_C(const int32_t* tl,
const int32_t* bl,
int w,
@@ -3721,7 +5914,12 @@ void InterpolateRow_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
-void InterpolateRow_MMI(uint8_t* dst_ptr,
+void InterpolateRow_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
@@ -3746,7 +5944,7 @@ void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+void InterpolateRow_Any_LSX(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
@@ -3757,6 +5955,47 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16_Any_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+
+void InterpolateRow_16To8_C(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
// Sobel images.
void SobelXRow_C(const uint8_t* src_y0,
@@ -3779,11 +6018,6 @@ void SobelXRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y2,
uint8_t* dst_sobelx,
int width);
-void SobelXRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width);
void SobelYRow_C(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
@@ -3800,10 +6034,6 @@ void SobelYRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
int width);
-void SobelYRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width);
void SobelRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3820,7 +6050,7 @@ void SobelRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
-void SobelRow_MMI(const uint8_t* src_sobelx,
+void SobelRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
@@ -3840,7 +6070,7 @@ void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
-void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
@@ -3860,7 +6090,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
-void SobelXYRow_MMI(const uint8_t* src_sobelx,
+void SobelXYRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
@@ -3876,7 +6106,7 @@ void SobelRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelRow_Any_MMI(const uint8_t* y_buf,
+void SobelRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3892,7 +6122,7 @@ void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+void SobelToPlaneRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3908,7 +6138,7 @@ void SobelXYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+void SobelXYRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3984,6 +6214,14 @@ void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
uint16_t* dst_ptr,
float param,
int width);
+void HalfFloatRow_LSX(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_LSX(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
@@ -3993,7 +6231,19 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
float* dst_ptr,
float param,
int width);
-
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
+ float* dst,
+ int width);
+// Convert a column of FP16 Half Floats to a row of FP32 Floats
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
+ int src_stride, // stride in elements
+ float* dst,
+ int width);
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+ uint16_t* dst, // fp16
+ int width);
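+
+// A hedged scalar reference for the FP16 -> FP32 mapping these rows are
+// assumed to compute (the NEON kernels presumably use hardware vcvt).
+// Needs <math.h>; handles zeros, subnormals, normals, inf and NaN:
+//   static float HalfToFloat(uint16_t h) {
+//     int sign = (h >> 15) ? -1 : 1;
+//     int exp = (h >> 10) & 0x1F;
+//     int mant = h & 0x3FF;
+//     if (exp == 31) return mant ? NAN : sign * INFINITY;    // inf/NaN
+//     if (exp == 0) return sign * ldexpf((float)mant, -24);  // subnormal
+//     return sign * ldexpf((float)(mant + 1024), exp - 25);  // normal
+//   }
+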
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
@@ -4018,16 +6268,35 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
-void FloatDivToByteRow_C(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width);
-void FloatDivToByteRow_NEON(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width);
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
+void GaussCol_C(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width);
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width);
#ifdef __cplusplus
} // extern "C"
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
new file mode 100644
index 00000000..bfe4a344
--- /dev/null
+++ b/include/libyuv/scale.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+// Returns 0 if successful.
+LIBYUV_API
+int ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
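+
+// A hedged usage sketch (sizes illustrative): upscale one tightly packed
+// 8-bit plane 2x, with stride equal to width:
+//   uint8_t src[64 * 64], dst[128 * 128];
+//   ScalePlane(src, 64, 64, 64, dst, 128, 128, 128, kFilterBilinear);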
+
+LIBYUV_API
+int ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Sample is expected to be in the low 12 bits.
+LIBYUV_API
+int ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
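+
+// A hedged usage sketch (buffer names and sizes are illustrative):
+// downscale a 640x360 I420 frame to 320x180; for 4:2:0 the U and V
+// planes are half width and half height:
+//   uint8_t src_y[640 * 360], src_u[320 * 180], src_v[320 * 180];
+//   uint8_t dst_y[320 * 180], dst_u[160 * 90], dst_v[160 * 90];
+//   int r = I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 360,
+//                     dst_y, 320, dst_u, 160, dst_v, 160, 320, 180,
+//                     kFilterBilinear);  // r == 0 on success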
+
+LIBYUV_API
+int I420Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
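+
+// A hedged usage sketch (sizes illustrative): for 4:4:4 all three planes
+// are full size, so the chroma strides match the luma stride:
+//   uint8_t sy[320 * 240], su[320 * 240], sv[320 * 240];
+//   uint8_t dy[160 * 120], du[160 * 120], dv[160 * 120];
+//   I444Scale(sy, 320, su, 320, sv, 320, 320, 240,
+//             dy, 160, du, 160, dv, 160, 160, 120, kFilterBox);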
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:2 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
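+
+// A hedged usage sketch (sizes illustrative): for 4:2:2 the U and V
+// planes are half width but full height:
+//   uint8_t sy[640 * 360], su[320 * 360], sv[320 * 360];
+//   uint8_t dy[320 * 180], du[160 * 180], dv[160 * 180];
+//   I422Scale(sy, 640, su, 320, sv, 320, 640, 360,
+//             dy, 320, du, 160, dv, 160, 320, 180, kFilterBilinear);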
+
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
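+
+// A hedged usage sketch (sizes illustrative): halve a 1280x720 NV12
+// frame; the interleaved UV plane has src_width bytes per row and half
+// the Y plane's rows:
+//   uint8_t src_y[1280 * 720], src_uv[1280 * 360];
+//   uint8_t dst_y[640 * 360], dst_uv[640 * 180];
+//   NV12Scale(src_y, 1280, src_uv, 1280, 1280, 720,
+//             dst_y, 640, dst_uv, 640, 640, 360, kFilterBilinear);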
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
diff --git a/files/include/libyuv/scale_argb.h b/include/libyuv/scale_argb.h
index 7641f18e..7641f18e 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/include/libyuv/scale_argb.h
diff --git a/include/libyuv/scale_rgb.h b/include/libyuv/scale_rgb.h
new file mode 100644
index 00000000..d17c39fd
--- /dev/null
+++ b/include/libyuv/scale_rgb.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_RGB_H_
+#define INCLUDE_LIBYUV_SCALE_RGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// RGB can be RAW, RGB24 or YUV24
+// RGB scales 24-bit images by converting a row at a time to ARGB,
+// scaling with the ARGB row functions, then converting back to RGB.
+// TODO(fbarchard): Allow input/output formats to be specified.
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
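+
+// A hedged usage sketch (sizes illustrative): shrink a tightly packed
+// RGB24 image; strides are in bytes, i.e. 3 * width:
+//   uint8_t src_rgb[800 * 600 * 3], dst_rgb[400 * 300 * 3];
+//   RGBScale(src_rgb, 800 * 3, 800, 600,
+//            dst_rgb, 400 * 3, 400, 300, kFilterBox);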
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_RGB_H_
diff --git a/files/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 6e207a9c..02ed61ca 100644
--- a/files/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -29,7 +29,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -72,6 +75,43 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSSE3
#endif
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_SSE2
+#define HAS_SCALEROWUP2_LINEAR_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_16_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_16_SSE2
+#define HAS_SCALEUVROWUP2_LINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#define HAS_SCALEROWUP2_LINEAR_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_AVX2
+#define HAS_SCALEROWUP2_LINEAR_12_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_12_AVX2
+#define HAS_SCALEROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+#endif
+
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCl but requires a new compiler and validator.
@@ -96,6 +136,20 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2_NEON
+#define HAS_SCALEUVROWDOWN2LINEAR_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
+#define HAS_SCALEROWUP2_LINEAR_NEON
+#define HAS_SCALEROWUP2_BILINEAR_NEON
+#define HAS_SCALEROWUP2_LINEAR_12_NEON
+#define HAS_SCALEROWUP2_BILINEAR_12_NEON
+#define HAS_SCALEROWUP2_LINEAR_16_NEON
+#define HAS_SCALEROWUP2_BILINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -111,21 +165,49 @@ extern "C" {
#define HAS_SCALEROWDOWN4_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_FIXEDDIV1_MIPS
-#define HAS_FIXEDDIV_MIPS
-#define HAS_SCALEADDROW_16_MMI
-#define HAS_SCALEADDROW_MMI
-#define HAS_SCALEARGBCOLS_MMI
-#define HAS_SCALEARGBCOLSUP2_MMI
-#define HAS_SCALEARGBROWDOWN2_MMI
-#define HAS_SCALEARGBROWDOWNEVEN_MMI
-#define HAS_SCALECOLS_16_MMI
-#define HAS_SCALECOLS_MMI
-#define HAS_SCALEROWDOWN2_16_MMI
-#define HAS_SCALEROWDOWN2_MMI
-#define HAS_SCALEROWDOWN4_16_MMI
-#define HAS_SCALEROWDOWN4_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_SCALEARGBROWDOWN2_LSX
+#define HAS_SCALEARGBROWDOWNEVEN_LSX
+#define HAS_SCALEROWDOWN2_LSX
+#define HAS_SCALEROWDOWN4_LSX
+#define HAS_SCALEROWDOWN38_LSX
+#define HAS_SCALEFILTERCOLS_LSX
+#define HAS_SCALEADDROW_LSX
+#define HAS_SCALEARGBCOLS_LSX
+#define HAS_SCALEARGBFILTERCOLS_LSX
+#define HAS_SCALEROWDOWN34_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#define HAS_SCALEADDROW_RVV
+// TODO: Test ScaleARGBRowDownEven_RVV and enable it
+// #define HAS_SCALEARGBROWDOWNEVEN_RVV
+#define HAS_SCALEUVROWDOWN4_RVV
+#define HAS_SCALEUVROWDOWNEVEN_RVV
+#if __riscv_v_intrinsic == 11000
+#define HAS_SCALEARGBROWDOWN2_RVV
+#define HAS_SCALEARGBROWDOWN2BOX_RVV
+#define HAS_SCALEARGBROWDOWN2LINEAR_RVV
+#define HAS_SCALEARGBROWDOWNEVENBOX_RVV
+#define HAS_SCALEROWDOWN2_RVV
+#define HAS_SCALEROWDOWN2BOX_RVV
+#define HAS_SCALEROWDOWN2LINEAR_RVV
+#define HAS_SCALEROWDOWN34_0_BOX_RVV
+#define HAS_SCALEROWDOWN34_1_BOX_RVV
+#define HAS_SCALEROWDOWN34_RVV
+#define HAS_SCALEROWDOWN38_2_BOX_RVV
+#define HAS_SCALEROWDOWN38_3_BOX_RVV
+#define HAS_SCALEROWDOWN38_RVV
+#define HAS_SCALEROWDOWN4_RVV
+#define HAS_SCALEROWDOWN4BOX_RVV
+#define HAS_SCALEROWUP2_BILINEAR_RVV
+#define HAS_SCALEROWUP2_LINEAR_RVV
+#define HAS_SCALEUVROWDOWN2_RVV
+#define HAS_SCALEUVROWDOWN2BOX_RVV
+#define HAS_SCALEUVROWDOWN2LINEAR_RVV
+#define HAS_SCALEUVROWUP2_BILINEAR_RVV
+#define HAS_SCALEUVROWUP2_LINEAR_RVV
+#endif
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -155,6 +237,31 @@ void ScalePlaneVertical_16(int src_height,
int wpp,
enum FilterMode filtering);
+void ScalePlaneVertical_16To8(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ int scale,
+ enum FilterMode filtering);
+
+void ScalePlaneDown2_16To8(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ enum FilterMode filtering);
+
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
@@ -200,6 +307,16 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -208,6 +325,16 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -220,6 +347,16 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -260,6 +397,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);
+
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
@@ -375,6 +546,87 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
int dst_width,
int x32,
int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -442,6 +694,120 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -592,16 +958,6 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
int dst_width,
int x,
int dx);
-void ScaleARGBCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx);
-void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@@ -628,6 +984,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -640,15 +1008,15 @@ void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
@@ -688,15 +1056,15 @@ void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
@@ -730,12 +1098,22 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_argb,
@@ -770,16 +1148,285 @@ void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDownEven_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -891,6 +1538,55 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -1012,93 +1708,184 @@ void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width);
-void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
-void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
- uint32_t* dst_ptr,
- int src_width);
-void ScaleColsUp2_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx);
-void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx);
-
-void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown2_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown38_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_LSX(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
+void ScaleFilterCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
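The `_Any_` declarations added above follow libyuv's dispatch convention: a SIMD kernel processes the widest multiple of its block size, and the remainder is finished separately (the real macros in source/scale_any.cc copy the tail into a small temporary buffer and re-run the SIMD kernel once). Below is a minimal sketch of that shape, with hypothetical stand-in kernels and a scalar tail for brevity; it is illustrative only, not libyuv source.

// any_dispatch_sketch.cc - illustrative only; not libyuv source.
#include <cstdint>
#include <cstdio>

// Stand-in "SIMD" kernel: handles widths that are a multiple of 16 pixels.
// In libyuv this would be e.g. ScaleUVRowUp2_Linear_SSSE3.
static void ScaleRowBlock16(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];  // pretend-vectorized copy
}

// Portable scalar kernel, standing in for the _C suffix functions.
static void ScaleRowC(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];
}

// The wrapper shape behind the _Any_ declarations: SIMD on the aligned
// body, scalar on the tail, so callers may pass any dst_width.
static void ScaleRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  int n = width & ~15;  // largest multiple of the 16-pixel block
  if (n > 0) ScaleRowBlock16(src, dst, n);
  if (width > n) ScaleRowC(src + n, dst + n, width - n);
}

int main() {
  uint8_t src[37], dst[37] = {0};
  for (int i = 0; i < 37; ++i) src[i] = static_cast<uint8_t>(i);
  ScaleRow_Any(src, dst, 37);  // 32 pixels via "SIMD", 5 via the scalar tail
  printf("dst[36] = %d\n", dst[36]);
  return 0;
}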
diff --git a/include/libyuv/scale_uv.h b/include/libyuv/scale_uv.h
new file mode 100644
index 00000000..8e74e319
--- /dev/null
+++ b/include/libyuv/scale_uv.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
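The new UVScale entry point scales an interleaved UV plane, such as the chroma plane of an NV12 frame. A hypothetical caller is sketched below, assuming widths and heights are given in UV sample pairs and strides in bytes (two bytes, U then V, per pair); the buffer sizes and filter choice are illustrative.

// uvscale_usage.cc - a hypothetical caller, assuming a 640x360 NV12 frame
// whose interleaved UV plane is 320x180 UV pairs.
#include <cstdint>
#include <vector>
#include "libyuv/scale_uv.h"

int main() {
  const int src_w = 320, src_h = 180;  // (luma_w + 1) / 2 x (luma_h + 1) / 2
  const int dst_w = 160, dst_h = 90;
  std::vector<uint8_t> src(static_cast<size_t>(src_w) * src_h * 2, 128);
  std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h * 2);
  // Strides are in bytes: two bytes (U then V) per UV pair.
  return libyuv::UVScale(src.data(), src_w * 2, src_w, src_h,
                         dst.data(), dst_w * 2, dst_w, dst_h,
                         libyuv::kFilterBilinear);
}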
diff --git a/files/include/libyuv/version.h b/include/libyuv/version.h
index 741ef34d..a9c54400 100644
--- a/files/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1732
+#define LIBYUV_VERSION 1883
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/files/include/libyuv/video_common.h b/include/libyuv/video_common.h
index ffcbdbf1..32b8a521 100644
--- a/files/include/libyuv/video_common.h
+++ b/include/libyuv/video_common.h
@@ -50,7 +50,7 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -59,17 +59,20 @@ enum FourCC {
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
+ FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
- // 1 Secondary YUV format: row biplanar.
+  // 1 Secondary YUV format: row biplanar. Deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
+  // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc, 2 64 bpp
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
+ FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel.
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
@@ -80,15 +83,36 @@ enum FourCC {
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 25 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
- FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
- FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc
+ FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc
+ FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
+ FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
+ FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420
+ FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422
+ FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
+ FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
+ FOURCC_P010 = FOURCC('P', '0', '1', '0'),
+ FOURCC_P210 = FOURCC('P', '2', '1', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -133,7 +157,7 @@ enum FourCCBpp {
FOURCC_BPP_NV12 = 12,
FOURCC_BPP_YUY2 = 16,
FOURCC_BPP_UYVY = 16,
- FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_M420 = 12, // deprecated
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
@@ -141,6 +165,8 @@ enum FourCCBpp {
FOURCC_BPP_RGBA = 32,
FOURCC_BPP_AR30 = 32,
FOURCC_BPP_AB30 = 32,
+ FOURCC_BPP_AR64 = 64,
+ FOURCC_BPP_AB64 = 64,
FOURCC_BPP_24BG = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
@@ -158,7 +184,12 @@ enum FourCCBpp {
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_H422 = 16,
- FOURCC_BPP_H010 = 24,
+ FOURCC_BPP_I010 = 15,
+ FOURCC_BPP_I210 = 20,
+ FOURCC_BPP_H010 = 15,
+ FOURCC_BPP_H210 = 20,
+ FOURCC_BPP_P010 = 15,
+ FOURCC_BPP_P210 = 20,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
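The new FOURCC_BPP values for the 10 bit formats appear to count payload bits rather than storage: 4:2:0 carries 1.5 samples per pixel, so 10 bit samples average 15 bpp (I010/H010/P010), while 4:2:2 carries 2 samples per pixel, hence 20 bpp (I210/H210/P210). In memory these formats use 16 bit samples, so the table suits bandwidth estimates rather than allocation. A small sketch of sizing from the table, using constants copied from the diff:

// bpp_sizing_sketch.cc - illustrative buffer sizing from FOURCC_BPP_* values.
#include <cstdint>
#include <cstdio>

// Payload size in bytes for a frame, given an average bits-per-pixel value
// from the FOURCC_BPP_* table (valid when bpp > 0; 0 means unknown).
static uint64_t FramePayloadBytes(int width, int height, int bpp) {
  return static_cast<uint64_t>(width) * height * bpp / 8;
}

int main() {
  // 15 and 20 match FOURCC_BPP_I010 and FOURCC_BPP_I210 above.
  printf("1280x720 I010 payload: %llu bytes\n",
         (unsigned long long)FramePayloadBytes(1280, 720, 15));
  printf("1280x720 I210 payload: %llu bytes\n",
         (unsigned long long)FramePayloadBytes(1280, 720, 20));
  return 0;
}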
diff --git a/infra/config/OWNERS b/infra/config/OWNERS
new file mode 100644
index 00000000..2c4f90a0
--- /dev/null
+++ b/infra/config/OWNERS
@@ -0,0 +1,3 @@
+fbarchard@chromium.org
+mbonadei@chromium.org
+jansson@google.com
diff --git a/infra/config/PRESUBMIT.py b/infra/config/PRESUBMIT.py
new file mode 100644
index 00000000..f79e08ad
--- /dev/null
+++ b/infra/config/PRESUBMIT.py
@@ -0,0 +1,13 @@
+# Copyright 2018 The PDFium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+USE_PYTHON3 = True
+
+
+def CheckChangeOnUpload(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
+
+
+def CheckChangeOnCommit(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
diff --git a/infra/config/README.md b/infra/config/README.md
new file mode 100644
index 00000000..e5e3b5f8
--- /dev/null
+++ b/infra/config/README.md
@@ -0,0 +1,2 @@
+This folder contains libyuv project-wide configurations
+for chrome-infra services.
diff --git a/files/codereview.settings b/infra/config/codereview.settings
index 00ba1d37..6d742273 100644
--- a/files/codereview.settings
+++ b/infra/config/codereview.settings
@@ -1,6 +1,6 @@
-# This file is used by git cl to get repository specific information.
+# This file is used by gcl and git-cl to get repository-specific information.
CODE_REVIEW_SERVER: codereview.chromium.org
-GERRIT_HOST: True
PROJECT: libyuv
-TRY_ON_UPLOAD: False
+GERRIT_HOST: True
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
+
diff --git a/infra/config/commit-queue.cfg b/infra/config/commit-queue.cfg
new file mode 100644
index 00000000..4a8d77f4
--- /dev/null
+++ b/infra/config/commit-queue.cfg
@@ -0,0 +1,143 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see Config message:
+# https://luci-config.appspot.com/schemas/projects:commit-queue.cfg
+
+cq_status_host: "chromium-cq-status.appspot.com"
+submit_options {
+ max_burst: 4
+ burst_delay {
+ seconds: 480
+ }
+}
+config_groups {
+ name: "config"
+ gerrit {
+ url: "https://chromium-review.googlesource.com"
+ projects {
+ name: "libyuv/libyuv"
+ ref_regexp: "refs/heads/infra/config"
+ }
+ }
+ verifiers {
+ gerrit_cq_ability {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+ tryjob {
+ builders {
+ name: "libyuv/try/presubmit"
+ }
+ retry_config {
+ single_quota: 1
+ global_quota: 2
+ failure_weight: 1
+ transient_failure_weight: 1
+ timeout_weight: 2
+ }
+ }
+ }
+}
+config_groups {
+ name: "master"
+ gerrit {
+ url: "https://chromium-review.googlesource.com"
+ projects {
+ name: "libyuv/libyuv"
+ ref_regexp: "refs/heads/main"
+ ref_regexp: "refs/heads/master"
+ }
+ }
+ verifiers {
+ gerrit_cq_ability {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+ tryjob {
+ builders {
+ name: "libyuv/try/android"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_arm64"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_rel"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_x64"
+ }
+ builders {
+ name: "libyuv/try/android_x86"
+ }
+ builders {
+ name: "libyuv/try/ios_arm64"
+ }
+ builders {
+ name: "libyuv/try/ios_arm64_rel"
+ }
+ builders {
+ name: "libyuv/try/linux"
+ }
+ builders {
+ name: "libyuv/try/linux_asan"
+ }
+ builders {
+ name: "libyuv/try/linux_gcc"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/linux_msan"
+ }
+ builders {
+ name: "libyuv/try/linux_rel"
+ }
+ builders {
+ name: "libyuv/try/linux_tsan2"
+ }
+ builders {
+ name: "libyuv/try/linux_ubsan"
+ }
+ builders {
+ name: "libyuv/try/linux_ubsan_vptr"
+ }
+ builders {
+ name: "libyuv/try/mac"
+ }
+ builders {
+ name: "libyuv/try/mac_asan"
+ }
+ builders {
+ name: "libyuv/try/mac_rel"
+ }
+ builders {
+ name: "libyuv/try/win"
+ }
+ builders {
+ name: "libyuv/try/win_clang"
+ }
+ builders {
+ name: "libyuv/try/win_clang_rel"
+ }
+ builders {
+ name: "libyuv/try/win_rel"
+ }
+ builders {
+ name: "libyuv/try/win_x64_clang_rel"
+ }
+ builders {
+ name: "libyuv/try/win_x64_rel"
+ }
+ retry_config {
+ single_quota: 1
+ global_quota: 2
+ failure_weight: 1
+ transient_failure_weight: 1
+ timeout_weight: 2
+ }
+ }
+ }
+}
diff --git a/infra/config/cr-buildbucket.cfg b/infra/config/cr-buildbucket.cfg
new file mode 100644
index 00000000..7415851b
--- /dev/null
+++ b/infra/config/cr-buildbucket.cfg
@@ -0,0 +1,1704 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see BuildbucketCfg message:
+# https://luci-config.appspot.com/schemas/projects:buildbucket.cfg
+
+buckets {
+ name: "ci"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ swarming {
+ builders {
+ name: "Android ARM64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM32 Debug (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM32 Release (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM64 Debug (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android32 x86 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android64 x64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux Asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux MSan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux Tsan v2"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux UBSan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux UBSan vptr"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux32 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux32 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac Asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Debug (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Release (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Debug (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Release (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "iOS ARM64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "iOS ARM64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
+buckets {
+ name: "cron"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ swarming {
+ builders {
+ name: "DEPS Autoroller"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Linux"
+ dimensions: "pool:luci.webrtc.cron"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "recipe": "libyuv/roll_deps"'
+ '}'
+ execution_timeout_secs: 7200
+ build_numbers: YES
+ service_account: "libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
+buckets {
+ name: "try"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ acls {
+ role: SCHEDULER
+ group: "project-libyuv-tryjob-access"
+ }
+ acls {
+ role: SCHEDULER
+ group: "service-account-cq"
+ }
+ swarming {
+ builders {
+ name: "android"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_arm64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_x64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_x86"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "ios_arm64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "ios_arm64_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_gcc"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_msan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_tsan2"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_ubsan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_ubsan_vptr"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac_asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "presubmit"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "run_presubmit",'
+ ' "repo_name": "libyuv",'
+ ' "runhooks": true'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_clang"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_clang_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_x64_clang_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_x64_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
diff --git a/infra/config/luci-logdog.cfg b/infra/config/luci-logdog.cfg
new file mode 100644
index 00000000..adc75bef
--- /dev/null
+++ b/infra/config/luci-logdog.cfg
@@ -0,0 +1,9 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectConfig message:
+# https://luci-config.appspot.com/schemas/projects:luci-logdog.cfg
+
+reader_auth_groups: "all"
+writer_auth_groups: "luci-logdog-chromium-writers"
+archive_gs_bucket: "chromium-luci-logdog"
diff --git a/infra/config/luci-milo.cfg b/infra/config/luci-milo.cfg
new file mode 100644
index 00000000..baf786f2
--- /dev/null
+++ b/infra/config/luci-milo.cfg
@@ -0,0 +1,246 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see Project message:
+# https://luci-config.appspot.com/schemas/projects:luci-milo.cfg
+
+consoles {
+ id: "main"
+ name: "libyuv Main Console"
+ repo_url: "https://chromium.googlesource.com/libyuv/libyuv"
+ refs: "regexp:refs/heads/main"
+ manifest_name: "REVISION"
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android ARM64 Debug"
+ category: "Android|Builder"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Debug"
+ category: "Android|Builder"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Release"
+ category: "Android|Builder"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android32 x86 Debug"
+ category: "Android|Builder|x86"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android64 x64 Debug"
+ category: "Android|Builder|x64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM32 Debug (Nexus 5X)"
+ category: "Android|Tester|ARM 32"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM32 Release (Nexus 5X)"
+ category: "Android|Tester|ARM 32"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM64 Debug (Nexus 5X)"
+ category: "Android|Tester|ARM 64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux Asan"
+ category: "Linux"
+ short_name: "asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux MSan"
+ category: "Linux"
+ short_name: "msan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux Tsan v2"
+ category: "Linux"
+ short_name: "tsan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux UBSan"
+ category: "Linux|UBSan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux UBSan vptr"
+ category: "Linux|UBSan"
+ short_name: "vptr"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux32 Debug"
+ category: "Linux|32"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux32 Release"
+ category: "Linux|32"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux64 Debug"
+ category: "Linux|64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux64 Release"
+ category: "Linux|64"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac Asan"
+ category: "Mac"
+ short_name: "asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac64 Debug"
+ category: "Mac"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac64 Release"
+ category: "Mac"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Debug"
+ category: "Win|32|Debug"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Debug (Clang)"
+ category: "Win|32|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Release"
+ category: "Win|32|Release"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Release (Clang)"
+ category: "Win|32|Release"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Debug"
+ category: "Win|64|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Debug (Clang)"
+ category: "Win|64|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Release"
+ category: "Win|64|Release"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Release (Clang)"
+ category: "Win|64|Release"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/iOS ARM64 Debug"
+ category: "iOS|ARM64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/iOS ARM64 Release"
+ category: "iOS|ARM64"
+ short_name: "rel"
+ }
+ include_experimental_builds: true
+}
+consoles {
+ id: "cron"
+ name: "Cron"
+ builders {
+ name: "buildbucket/luci.libyuv.cron/DEPS Autoroller"
+ }
+ builder_view_only: true
+}
+consoles {
+ id: "try"
+ name: "libyuv Try Builders"
+ builders {
+ name: "buildbucket/luci.libyuv.try/android"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_arm64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_x64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_x86"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/ios_arm64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/ios_arm64_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_gcc"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_msan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_tsan2"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_ubsan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_ubsan_vptr"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac_asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_clang"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_clang_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_x64_clang_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_x64_rel"
+ }
+ builder_view_only: true
+}
+logo_url: "https://storage.googleapis.com/chrome-infra-public/logo/libyuv-logo.png"
diff --git a/infra/config/luci-scheduler.cfg b/infra/config/luci-scheduler.cfg
new file mode 100644
index 00000000..0ec5dd0e
--- /dev/null
+++ b/infra/config/luci-scheduler.cfg
@@ -0,0 +1,385 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectConfig message:
+# https://luci-config.appspot.com/schemas/projects:luci-scheduler.cfg
+
+job {
+ id: "Android ARM64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android ARM64 Debug"
+ }
+}
+job {
+ id: "Android Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Debug"
+ }
+}
+job {
+ id: "Android Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Release"
+ }
+}
+job {
+ id: "Android Tester ARM32 Debug (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM32 Debug (Nexus 5X)"
+ }
+}
+job {
+ id: "Android Tester ARM32 Release (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM32 Release (Nexus 5X)"
+ }
+}
+job {
+ id: "Android Tester ARM64 Debug (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM64 Debug (Nexus 5X)"
+ }
+}
+job {
+ id: "Android32 x86 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android32 x86 Debug"
+ }
+}
+job {
+ id: "Android64 x64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android64 x64 Debug"
+ }
+}
+job {
+ id: "DEPS Autoroller"
+ realm: "cron"
+ schedule: "0 14 * * *"
+ acl_sets: "cron"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "cron"
+ builder: "DEPS Autoroller"
+ }
+}
+job {
+ id: "Linux Asan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux Asan"
+ }
+}
+job {
+ id: "Linux MSan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux MSan"
+ }
+}
+job {
+ id: "Linux Tsan v2"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux Tsan v2"
+ }
+}
+job {
+ id: "Linux UBSan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux UBSan"
+ }
+}
+job {
+ id: "Linux UBSan vptr"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux UBSan vptr"
+ }
+}
+job {
+ id: "Linux32 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux32 Debug"
+ }
+}
+job {
+ id: "Linux32 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux32 Release"
+ }
+}
+job {
+ id: "Linux64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux64 Debug"
+ }
+}
+job {
+ id: "Linux64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux64 Release"
+ }
+}
+job {
+ id: "Mac Asan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac Asan"
+ }
+}
+job {
+ id: "Mac64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac64 Debug"
+ }
+}
+job {
+ id: "Mac64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac64 Release"
+ }
+}
+job {
+ id: "Win32 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Debug"
+ }
+}
+job {
+ id: "Win32 Debug (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Debug (Clang)"
+ }
+}
+job {
+ id: "Win32 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Release"
+ }
+}
+job {
+ id: "Win32 Release (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Release (Clang)"
+ }
+}
+job {
+ id: "Win64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Debug"
+ }
+}
+job {
+ id: "Win64 Debug (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Debug (Clang)"
+ }
+}
+job {
+ id: "Win64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Release"
+ }
+}
+job {
+ id: "Win64 Release (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Release (Clang)"
+ }
+}
+job {
+ id: "iOS ARM64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "iOS ARM64 Debug"
+ }
+}
+job {
+ id: "iOS ARM64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "iOS ARM64 Release"
+ }
+}
+trigger {
+ id: "master-gitiles-trigger"
+ realm: "ci"
+ acl_sets: "ci"
+ triggers: "Android ARM64 Debug"
+ triggers: "Android Debug"
+ triggers: "Android Release"
+ triggers: "Android32 x86 Debug"
+ triggers: "Android64 x64 Debug"
+ triggers: "Linux Asan"
+ triggers: "Linux MSan"
+ triggers: "Linux Tsan v2"
+ triggers: "Linux UBSan"
+ triggers: "Linux UBSan vptr"
+ triggers: "Linux32 Debug"
+ triggers: "Linux32 Release"
+ triggers: "Linux64 Debug"
+ triggers: "Linux64 Release"
+ triggers: "Mac Asan"
+ triggers: "Mac64 Debug"
+ triggers: "Mac64 Release"
+ triggers: "Win32 Debug"
+ triggers: "Win32 Debug (Clang)"
+ triggers: "Win32 Release"
+ triggers: "Win32 Release (Clang)"
+ triggers: "Win64 Debug"
+ triggers: "Win64 Debug (Clang)"
+ triggers: "Win64 Release"
+ triggers: "Win64 Release (Clang)"
+ triggers: "iOS ARM64 Debug"
+ triggers: "iOS ARM64 Release"
+ gitiles {
+ repo: "https://chromium.googlesource.com/libyuv/libyuv"
+ refs: "regexp:refs/heads/main"
+ }
+}
+acl_sets {
+ name: "ci"
+ acls {
+ role: OWNER
+ granted_to: "group:project-libyuv-admins"
+ }
+ acls {
+ granted_to: "group:all"
+ }
+}
+acl_sets {
+ name: "cron"
+ acls {
+ role: OWNER
+ granted_to: "group:project-libyuv-admins"
+ }
+ acls {
+ granted_to: "group:all"
+ }
+}
diff --git a/infra/config/main.star b/infra/config/main.star
new file mode 100755
index 00000000..e83afe4f
--- /dev/null
+++ b/infra/config/main.star
@@ -0,0 +1,344 @@
+#!/usr/bin/env lucicfg
+# https://chromium.googlesource.com/infra/luci/luci-go/+/master/lucicfg/doc/
+
+"""LUCI project configuration for libyuv CQ and CI."""
+
+lucicfg.check_version("1.30.9")
+
+LIBYUV_GIT = "https://chromium.googlesource.com/libyuv/libyuv"
+LIBYUV_GERRIT = "https://chromium-review.googlesource.com/libyuv/libyuv"
+
+RECLIENT_CI = {
+ "instance": "rbe-webrtc-trusted",
+ "metrics_project": "chromium-reclient-metrics",
+}
+
+RECLIENT_CQ = {
+ "instance": "rbe-webrtc-untrusted",
+ "metrics_project": "chromium-reclient-metrics",
+}
+
+# Use LUCI Scheduler BBv2 names and add Scheduler realms configs.
+lucicfg.enable_experiment("crbug.com/1182002")
+
+luci.builder.defaults.experiments.set(
+ {
+ "luci.recipes.use_python3": 100,
+ },
+)
+
+lucicfg.config(
+ lint_checks = ["default"],
+ config_dir = ".",
+ tracked_files = [
+ "commit-queue.cfg",
+ "cr-buildbucket.cfg",
+ "luci-logdog.cfg",
+ "luci-milo.cfg",
+ "luci-scheduler.cfg",
+ "project.cfg",
+ "realms.cfg",
+ ],
+)
+
+# Generates project.cfg
+
+luci.project(
+ name = "libyuv",
+ buildbucket = "cr-buildbucket.appspot.com",
+ logdog = "luci-logdog.appspot.com",
+ milo = "luci-milo.appspot.com",
+ notify = "luci-notify.appspot.com",
+ scheduler = "luci-scheduler.appspot.com",
+ swarming = "chromium-swarm.appspot.com",
+ acls = [
+ acl.entry(acl.PROJECT_CONFIGS_READER, groups = ["all"]),
+ acl.entry(acl.LOGDOG_READER, groups = ["all"]),
+ acl.entry(acl.LOGDOG_WRITER, groups = ["luci-logdog-chromium-writers"]),
+ acl.entry(acl.SCHEDULER_READER, groups = ["all"]),
+ acl.entry(acl.SCHEDULER_OWNER, groups = ["project-libyuv-admins"]),
+ acl.entry(acl.BUILDBUCKET_READER, groups = ["all"]),
+ acl.entry(acl.BUILDBUCKET_OWNER, groups = ["project-libyuv-admins"]),
+ ],
+ bindings = [
+ luci.binding(
+ roles = "role/swarming.taskTriggerer", # for LED tasks.
+ groups = "project-libyuv-admins",
+ ),
+ luci.binding(
+ roles = "role/configs.validator",
+ users = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com",
+ ),
+ ],
+)
+
+# Generates luci-logdog.cfg
+
+luci.logdog(
+ gs_bucket = "chromium-luci-logdog",
+)
+
+# Generates luci-scheduler.cfg
+
+luci.gitiles_poller(
+ name = "master-gitiles-trigger",
+ bucket = "ci",
+ repo = LIBYUV_GIT,
+)
+
+# Generates luci-milo.cfg
+
+luci.milo(
+ logo = "https://storage.googleapis.com/chrome-infra-public/logo/libyuv-logo.png",
+)
+
+def libyuv_ci_view(name, category, short_name):
+ return luci.console_view_entry(
+ console_view = "main",
+ builder = name,
+ category = category,
+ short_name = short_name,
+ )
+
+def libyuv_try_view(name):
+ return luci.list_view_entry(
+ list_view = "try",
+ builder = name,
+ )
+
+luci.console_view(
+ name = "main",
+ title = "libyuv Main Console",
+ include_experimental_builds = True,
+ repo = LIBYUV_GIT,
+)
+
+luci.list_view(
+ name = "cron",
+ title = "Cron",
+ entries = ["DEPS Autoroller"],
+)
+
+luci.list_view(
+ name = "try",
+ title = "libyuv Try Builders",
+)
+
+# Generates commit-queue.cfg
+
+def libyuv_try_job_verifier(name, cq_group, experiment_percentage):
+ return luci.cq_tryjob_verifier(
+ builder = name,
+ cq_group = cq_group,
+ experiment_percentage = experiment_percentage,
+ )
+
+luci.cq(
+ status_host = "chromium-cq-status.appspot.com",
+ submit_max_burst = 4,
+ submit_burst_delay = 8 * time.minute,
+)
+
+luci.cq_group(
+ name = "master",
+ watch = [
+ cq.refset(
+ repo = LIBYUV_GERRIT,
+ refs = ["refs/heads/main", "refs/heads/master"],
+ ),
+ ],
+ acls = [
+ acl.entry(acl.CQ_COMMITTER, groups = ["project-libyuv-committers"]),
+ acl.entry(acl.CQ_DRY_RUNNER, groups = ["project-libyuv-tryjob-access"]),
+ ],
+ retry_config = cq.RETRY_ALL_FAILURES,
+ cancel_stale_tryjobs = True,
+)
+
+luci.cq_group(
+ name = "config",
+ watch = [
+ cq.refset(
+ repo = LIBYUV_GERRIT,
+ refs = ["refs/heads/infra/config"],
+ ),
+ ],
+ acls = [
+ acl.entry(acl.CQ_COMMITTER, groups = ["project-libyuv-committers"]),
+ acl.entry(acl.CQ_DRY_RUNNER, groups = ["project-libyuv-tryjob-access"]),
+ ],
+ retry_config = cq.RETRY_ALL_FAILURES,
+ cancel_stale_tryjobs = True,
+)
+
+# Generates cr-buildbucket.cfg
+
+luci.bucket(
+ name = "ci",
+)
+luci.bucket(
+ name = "try",
+ acls = [
+ acl.entry(acl.BUILDBUCKET_TRIGGERER, groups = [
+ "project-libyuv-tryjob-access",
+ "service-account-cq",
+ ]),
+ ],
+)
+luci.bucket(
+ name = "cron",
+)
+
+def get_os_dimensions(os):
+ if os == "android":
+ return {"device_type": "walleye"}
+ if os == "ios" or os == "mac":
+ return {"os": "Mac-12", "cpu": "x86-64"}
+ elif os == "win":
+ return {"os": "Windows-10", "cores": "8", "cpu": "x86-64"}
+ elif os == "linux":
+ return {"os": "Ubuntu-18.04", "cores": "8", "cpu": "x86-64"}
+ return {}
+
+def libyuv_ci_builder(name, dimensions, properties, triggered_by):
+ return luci.builder(
+ name = name,
+ dimensions = dimensions,
+ properties = properties,
+ bucket = "ci",
+ service_account = "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com",
+ triggered_by = triggered_by,
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 180 * time.minute,
+ build_numbers = True,
+ executable = luci.recipe(
+ name = "libyuv/libyuv",
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+ )
+
+def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyuv"):
+ return luci.builder(
+ name = name,
+ dimensions = dimensions,
+ properties = properties,
+ bucket = "try",
+ service_account = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com",
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 180 * time.minute,
+ build_numbers = True,
+ executable = luci.recipe(
+ name = recipe_name,
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+ )
+
+def ci_builder(name, os, category, short_name = None):
+ dimensions = get_os_dimensions(os)
+ properties = {"$build/reclient": RECLIENT_CI}
+
+ dimensions["pool"] = "luci.flex.ci"
+ properties["builder_group"] = "client.libyuv"
+
+ triggered_by = ["master-gitiles-trigger" if os != "android" else "Android Debug"]
+ libyuv_ci_view(name, category, short_name)
+ return libyuv_ci_builder(name, dimensions, properties, triggered_by)
+
+def try_builder(name, os, experiment_percentage = None):
+ dimensions = get_os_dimensions(os)
+ properties = {"$build/reclient": RECLIENT_CQ}
+
+ dimensions["pool"] = "luci.flex.try"
+ properties["builder_group"] = "tryserver.libyuv"
+
+ if name == "presubmit":
+ recipe_name = "run_presubmit"
+ properties["repo_name"] = "libyuv"
+ properties["runhooks"] = True
+ libyuv_try_job_verifier(name, "config", experiment_percentage)
+ return libyuv_try_builder(name, dimensions, properties, recipe_name)
+
+ libyuv_try_job_verifier(name, "master", experiment_percentage)
+ libyuv_try_view(name)
+ return libyuv_try_builder(name, dimensions, properties)
+
+luci.builder(
+ name = "DEPS Autoroller",
+ bucket = "cron",
+ service_account = "libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com",
+ dimensions = {
+ "pool": "luci.webrtc.cron",
+ "os": "Linux",
+ "cpu": "x86-64",
+ },
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 120 * time.minute,
+ build_numbers = True,
+ schedule = "0 14 * * *", # Every 2 hours.
+ executable = luci.recipe(
+ name = "libyuv/roll_deps",
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+)
+
+ci_builder("Android ARM64 Debug", "linux", "Android|Builder", "dbg")
+ci_builder("Android Debug", "linux", "Android|Builder", "dbg")
+ci_builder("Android Release", "linux", "Android|Builder", "rel")
+ci_builder("Android32 x86 Debug", "linux", "Android|Builder|x86", "dbg")
+ci_builder("Android64 x64 Debug", "linux", "Android|Builder|x64", "dbg")
+ci_builder("Android Tester ARM32 Debug (Nexus 5X)", "android", "Android|Tester|ARM 32", "dbg")
+ci_builder("Android Tester ARM32 Release (Nexus 5X)", "android", "Android|Tester|ARM 32", "rel")
+ci_builder("Android Tester ARM64 Debug (Nexus 5X)", "android", "Android|Tester|ARM 64", "dbg")
+ci_builder("Linux Asan", "linux", "Linux", "asan")
+ci_builder("Linux MSan", "linux", "Linux", "msan")
+ci_builder("Linux Tsan v2", "linux", "Linux", "tsan")
+ci_builder("Linux UBSan", "linux", "Linux|UBSan")
+ci_builder("Linux UBSan vptr", "linux", "Linux|UBSan", "vptr")
+ci_builder("Linux32 Debug", "linux", "Linux|32", "dbg")
+ci_builder("Linux32 Release", "linux", "Linux|32", "rel")
+ci_builder("Linux64 Debug", "linux", "Linux|64", "dbg")
+ci_builder("Linux64 Release", "linux", "Linux|64", "rel")
+ci_builder("Mac Asan", "mac", "Mac", "asan")
+ci_builder("Mac64 Debug", "mac", "Mac", "dbg")
+ci_builder("Mac64 Release", "mac", "Mac", "rel")
+ci_builder("Win32 Debug", "win", "Win|32|Debug")
+ci_builder("Win32 Debug (Clang)", "win", "Win|32|Debug", "clg")
+ci_builder("Win32 Release", "win", "Win|32|Release")
+ci_builder("Win32 Release (Clang)", "win", "Win|32|Release", "clg")
+ci_builder("Win64 Debug", "win", "Win|64|Debug", "clg")
+ci_builder("Win64 Debug (Clang)", "win", "Win|64|Debug", "clg")
+ci_builder("Win64 Release", "win", "Win|64|Release")
+ci_builder("Win64 Release (Clang)", "win", "Win|64|Release", "clg")
+ci_builder("iOS ARM64 Debug", "ios", "iOS|ARM64", "dbg")
+ci_builder("iOS ARM64 Release", "ios", "iOS|ARM64", "rel")
+
+# TODO(crbug.com/1242847): make this not experimental.
+try_builder("android", "android", experiment_percentage = 100)
+try_builder("android_arm64", "android", experiment_percentage = 100)
+try_builder("android_rel", "android", experiment_percentage = 100)
+
+try_builder("android_x64", "linux")
+try_builder("android_x86", "linux")
+try_builder("ios_arm64", "ios")
+try_builder("ios_arm64_rel", "ios")
+try_builder("linux", "linux")
+try_builder("linux_asan", "linux")
+try_builder("linux_gcc", "linux", experiment_percentage = 100)
+try_builder("linux_msan", "linux")
+try_builder("linux_rel", "linux")
+try_builder("linux_tsan2", "linux")
+try_builder("linux_ubsan", "linux")
+try_builder("linux_ubsan_vptr", "linux")
+try_builder("mac", "mac")
+try_builder("mac_asan", "mac")
+try_builder("mac_rel", "mac")
+try_builder("win", "win")
+try_builder("win_clang", "win")
+try_builder("win_clang_rel", "win")
+try_builder("win_rel", "win")
+try_builder("win_x64_clang_rel", "win")
+try_builder("win_x64_rel", "win")
+try_builder("presubmit", "linux")
diff --git a/infra/config/project.cfg b/infra/config/project.cfg
new file mode 100644
index 00000000..3c327118
--- /dev/null
+++ b/infra/config/project.cfg
@@ -0,0 +1,15 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectCfg message:
+# https://luci-config.appspot.com/schemas/projects:project.cfg
+
+name: "libyuv"
+access: "group:all"
+lucicfg {
+ version: "1.39.14"
+ package_dir: "."
+ config_dir: "."
+ entry_point: "main.star"
+ experiments: "crbug.com/1182002"
+}
diff --git a/infra/config/realms.cfg b/infra/config/realms.cfg
new file mode 100644
index 00000000..16ffaac9
--- /dev/null
+++ b/infra/config/realms.cfg
@@ -0,0 +1,83 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see RealmsCfg message:
+# https://luci-config.appspot.com/schemas/projects:realms.cfg
+
+realms {
+ name: "@root"
+ bindings {
+ role: "role/buildbucket.owner"
+ principals: "group:project-libyuv-admins"
+ }
+ bindings {
+ role: "role/buildbucket.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/configs.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/configs.validator"
+ principals: "user:libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/logdog.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/logdog.writer"
+ principals: "group:luci-logdog-chromium-writers"
+ }
+ bindings {
+ role: "role/scheduler.owner"
+ principals: "group:project-libyuv-admins"
+ }
+ bindings {
+ role: "role/scheduler.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/swarming.taskTriggerer"
+ principals: "group:project-libyuv-admins"
+ }
+}
+realms {
+ name: "ci"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/scheduler.triggerer"
+ principals: "user:libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ conditions {
+ restrict {
+ attribute: "scheduler.job.name"
+ values: "Android Tester ARM32 Debug (Nexus 5X)"
+ values: "Android Tester ARM32 Release (Nexus 5X)"
+ values: "Android Tester ARM64 Debug (Nexus 5X)"
+ }
+ }
+ }
+}
+realms {
+ name: "cron"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+}
+realms {
+ name: "try"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/buildbucket.triggerer"
+ principals: "group:project-libyuv-tryjob-access"
+ principals: "group:service-account-cq"
+ }
+}
diff --git a/files/libyuv.gni b/libyuv.gni
index 8df40ba2..343160c3 100644
--- a/files/libyuv.gni
+++ b/libyuv.gni
@@ -6,13 +6,15 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("//build_overrides/build.gni")
import("//build/config/arm.gni")
+import("//build/config/loongarch64.gni")
import("//build/config/mips.gni")
+import("//build_overrides/build.gni")
declare_args() {
libyuv_include_tests = !build_with_chromium
libyuv_disable_jpeg = false
+ libyuv_disable_rvv = false
libyuv_use_neon =
current_cpu == "arm64" ||
(current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
@@ -20,4 +22,8 @@ declare_args() {
(current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
libyuv_use_mmi =
(current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
+ libyuv_use_lsx =
+ (current_cpu == "loong64") && loongarch64_use_lsx
+ libyuv_use_lasx =
+ (current_cpu == "loong64") && loongarch64_use_lasx
}
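For illustration only, a hedged GN invocation exercising the new arguments (the output directories and the target_cpu value are assumptions, not part of this change):

    gn gen out/loong64 --args='target_cpu="loong64" libyuv_use_lsx=true libyuv_use_lasx=true'
    gn gen out/release --args='libyuv_disable_rvv=true'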
diff --git a/files/libyuv.gyp b/libyuv.gyp
index f73a1a4b..f73a1a4b 100644
--- a/files/libyuv.gyp
+++ b/libyuv.gyp
diff --git a/files/libyuv.gypi b/libyuv.gypi
index 18b2feca..48936aa7 100644
--- a/files/libyuv.gypi
+++ b/libyuv.gypi
@@ -27,7 +27,9 @@
'include/libyuv/row.h',
'include/libyuv/scale.h',
'include/libyuv/scale_argb.h',
+ 'include/libyuv/scale_rgb.h',
'include/libyuv/scale_row.h',
+ 'include/libyuv/scale_uv.h',
'include/libyuv/version.h',
'include/libyuv/video_common.h',
@@ -35,6 +37,7 @@
'source/compare.cc',
'source/compare_common.cc',
'source/compare_gcc.cc',
+ 'source/compare_msa.cc',
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/compare_win.cc',
@@ -54,7 +57,6 @@
'source/rotate_argb.cc',
'source/rotate_common.cc',
'source/rotate_gcc.cc',
- 'source/rotate_dspr2.cc',
'source/rotate_msa.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
@@ -62,7 +64,6 @@
'source/row_any.cc',
'source/row_common.cc',
'source/row_gcc.cc',
- 'source/row_dspr2.cc',
'source/row_msa.cc',
'source/row_neon.cc',
'source/row_neon64.cc',
@@ -72,10 +73,11 @@
'source/scale_argb.cc',
'source/scale_common.cc',
'source/scale_gcc.cc',
- 'source/scale_dspr2.cc',
'source/scale_msa.cc',
'source/scale_neon.cc',
'source/scale_neon64.cc',
+ 'source/scale_rgb.cc',
+ 'source/scale_uv.cc',
'source/scale_win.cc',
'source/video_common.cc',
],
diff --git a/files/linux.mk b/linux.mk
index e9a26a79..d19a888a 100644
--- a/files/linux.mk
+++ b/linux.mk
@@ -13,15 +13,14 @@ LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
- source/compare_mmi.o \
source/compare_msa.o \
- source/compare_neon64.o \
source/compare_neon.o \
+ source/compare_neon64.o \
source/compare_win.o \
- source/convert_argb.o \
source/convert.o \
- source/convert_from_argb.o \
+ source/convert_argb.o \
source/convert_from.o \
+ source/convert_from_argb.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
@@ -29,33 +28,38 @@ LOCAL_OBJ_FILES := \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
+ source/rotate.o \
source/rotate_any.o \
source/rotate_argb.o \
- source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
- source/rotate_mmi.o \
+ source/rotate_lsx.o \
source/rotate_msa.o \
- source/rotate_neon64.o \
source/rotate_neon.o \
+ source/rotate_neon64.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
- source/row_mmi.o \
+ source/row_lasx.o \
+ source/row_lsx.o \
source/row_msa.o \
- source/row_neon64.o \
source/row_neon.o \
+ source/row_neon64.o \
+ source/row_rvv.o \
source/row_win.o \
+ source/scale.o \
source/scale_any.o \
source/scale_argb.o \
- source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
- source/scale_mmi.o \
+ source/scale_lsx.o \
source/scale_msa.o \
- source/scale_neon64.o \
source/scale_neon.o \
+ source/scale_neon64.o \
+ source/scale_rgb.o \
+ source/scale_rvv.o \
+ source/scale_uv.o \
source/scale_win.o \
source/video_common.o
@@ -65,7 +69,7 @@ LOCAL_OBJ_FILES := \
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
-all: libyuv.a yuvconvert cpuid psnr
+all: libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
@@ -74,10 +78,18 @@ libyuv.a: $(LOCAL_OBJ_FILES)
yuvconvert: util/yuvconvert.cc libyuv.a
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a
+# A C test utility that generates the yuvconstants tables used for YUV-to-RGB conversion.
+yuvconstants: util/yuvconstants.c libyuv.a
+ $(CXX) $(CXXFLAGS) -Iutil/ -lm -o $@ util/yuvconstants.c libyuv.a
+
# A standalone test utility
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+# A simple conversion example.
+i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a
+ $(CXX) $(CXXFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a
+
# A C test utility that uses libyuv conversion from C.
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
@@ -85,4 +97,4 @@ cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
clean:
- /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr
+ /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr
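With the new utilities wired into the all target, a plain build from the source root is (GNU make assumed):

    make -f linux.mk         # builds libyuv.a, yuvconvert, yuvconstants, i444tonv12_eg, cpuid, psnr
    make -f linux.mk clean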
diff --git a/public.mk b/public.mk
index 259ece21..1342307a 100644
--- a/public.mk
+++ b/public.mk
@@ -3,7 +3,7 @@
# Note that dependencies on NDK are not directly listed since NDK auto adds
# them.
-LIBYUV_INCLUDES := $(LIBYUV_PATH)/files/include
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
LIBYUV_C_FLAGS :=
diff --git a/files/pylintrc b/pylintrc
index b8bea334..b8bea334 100644
--- a/files/pylintrc
+++ b/pylintrc
diff --git a/riscv_script/prepare_toolchain_qemu.sh b/riscv_script/prepare_toolchain_qemu.sh
new file mode 100755
index 00000000..2a901739
--- /dev/null
+++ b/riscv_script/prepare_toolchain_qemu.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -ev
+
+# Download & build RISC-V Clang toolchain & QEMU emulator.
+# RISC-V Clang is for cross compile with the RISC-V Vector ISA.
+# RISC-V QEMU is used to run the test suite.
+#
+# Requirements: Linux host w/ working C++ compiler, git, cmake, ninja, wget, tar
+
+# NOTE: this script must be run from the top-level libyuv source directory.
+
+RISCV_TRIPLE="riscv64-unknown-linux-gnu"
+RISCV_QEMU="qemu-riscv64"
+
+LIBYUV_SRC_DIR=$(pwd)
+BUILD_DIR="$LIBYUV_SRC_DIR"/build-toolchain-qemu
+INSTALL_QEMU="$BUILD_DIR"/riscv-qemu
+INSTALL_CLANG="$BUILD_DIR"/riscv-clang
+
+LLVM_VERSION="16.0.0"
+LLVM_NAME=llvm-project-"$LLVM_VERSION".src
+
+RISCV_GNU_TOOLCHAIN="$BUILD_DIR"/riscv-gnu-toolchain
+RISCV_CLANG_TOOLCHAIN="$BUILD_DIR"/"$LLVM_NAME"
+
+QEMU_NAME="qemu-7.0.0"
+
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+# Download and install RISC-V GNU Toolchain (needed to build Clang)
+if [ ! -d "$RISCV_GNU_TOOLCHAIN" ]
+then
+ git clone https://github.com/riscv/riscv-gnu-toolchain.git
+ pushd "$RISCV_GNU_TOOLCHAIN"
+ git submodule update --init --recursive
+ ./configure --with-cmodel=medany --prefix="$INSTALL_CLANG"
+ ionice nice make linux -j `nproc` install
+ popd
+fi
+
+# Download Clang toolchain & build cross compiler
+if [ ! -d "$RISCV_CLANG_TOOLCHAIN" ]
+then
+ wget https://github.com/llvm/llvm-project/releases/download/llvmorg-"$LLVM_VERSION"/"$LLVM_NAME".tar.xz
+ tar xvJf "$LLVM_NAME".tar.xz
+ pushd "$RISCV_CLANG_TOOLCHAIN"
+ cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_CLANG" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_TARGETS_TO_BUILD="RISCV" \
+ -DLLVM_ENABLE_PROJECTS="clang" \
+ -DLLVM_DEFAULT_TARGET_TRIPLE="$RISCV_TRIPLE" \
+ -DLLVM_INSTALL_TOOLCHAIN_ONLY=On \
+ -DDEFAULT_SYSROOT=../sysroot \
+ -G "Ninja" "$RISCV_CLANG_TOOLCHAIN"/llvm
+ ionice nice ninja -j `nproc`
+ ionice nice ninja -j `nproc` install
+ popd
+ pushd "$INSTALL_CLANG"/bin
+ ln -sf clang "$RISCV_TRIPLE"-clang
+ ln -sf clang++ "$RISCV_TRIPLE"-clang++
+ popd
+fi
+
+# Download QEMU and build the riscv64 Linux usermode emulator
+if [ ! -d "$QEMU_NAME" ]
+then
+ wget https://download.qemu.org/"$QEMU_NAME".tar.xz
+ tar xvJf "$QEMU_NAME".tar.xz
+ pushd "$QEMU_NAME"
+ ./configure --target-list=riscv64-linux-user --prefix="$INSTALL_QEMU"
+ ionice nice make -j `nproc` install
+ popd
+fi
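Usage sketch, following the paths the script hardcodes (it builds the GNU toolchain, Clang, and QEMU from source, so expect a long build):

    cd <libyuv source root>
    ./riscv_script/prepare_toolchain_qemu.sh
    # cross toolchain:  build-toolchain-qemu/riscv-clang
    # qemu-riscv64:     build-toolchain-qemu/riscv-qemu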
diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake
new file mode 100644
index 00000000..e287941f
--- /dev/null
+++ b/riscv_script/riscv-clang.cmake
@@ -0,0 +1,55 @@
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "riscv64")
+
+option(USE_RVV "Enable riscv vector or not." ON)
+option(USE_AUTO_VECTORIZER "Enable riscv auto vectorizer or not." OFF)
+
+# Avoid using host system paths when cross-compiling.
+set(CMAKE_FIND_USE_CMAKE_SYSTEM_PATH FALSE)
+
+set(TOOLCHAIN_PATH "" CACHE STRING "The toolcahin path.")
+if(NOT TOOLCHAIN_PATH)
+ set(TOOLCHAIN_PATH ${CMAKE_SOURCE_DIR}/build-toolchain-qemu/riscv-clang)
+endif()
+
+set(TOOLCHAIN_PREFIX "riscv64-unknown-linux-gnu-" CACHE STRING "The toolcahin prefix.")
+
+# toolchain setting
+set(CMAKE_C_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang")
+set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang++")
+
+# CMake would otherwise pick host-side versions of the following tools, so set them explicitly here.
+set(CMAKE_C_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar")
+set(CMAKE_CXX_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar")
+set(CMAKE_C_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib")
+set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump")
+set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy")
+
+# compile options
+set(RISCV_COMPILER_FLAGS "" CACHE STRING "Compile flags")
+# If the user provides RISCV_COMPILER_FLAGS, no extra compile flags are appended.
+if(RISCV_COMPILER_FLAGS STREQUAL "")
+ message(STATUS "USE_RVV: ${USE_RVV}")
+ message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}")
+ if(USE_RVV)
+ list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv")
+ if(NOT USE_AUTO_VECTORIZER)
+ # Disable auto-vectorizer
+ add_compile_options(-fno-vectorize -fno-slp-vectorize)
+ endif()
+ else()
+ list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc")
+ endif()
+endif()
+message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}")
+
+set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}")
+
+set(RISCV_LINKER_FLAGS "-lstdc++ -lpthread -lm -ldl")
+set(RISCV_LINKER_FLAGS_EXE)
+set(CMAKE_SHARED_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+set(CMAKE_MODULE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${RISCV_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}")
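A hedged cross-compile invocation using this toolchain file (the build directory name is illustrative; cmake -B requires CMake 3.13+):

    cmake -B out/riscv -DCMAKE_TOOLCHAIN_FILE=riscv_script/riscv-clang.cmake -DUSE_RVV=ON .
    cmake --build out/riscv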
diff --git a/riscv_script/run_qemu.sh b/riscv_script/run_qemu.sh
new file mode 100755
index 00000000..080af3b1
--- /dev/null
+++ b/riscv_script/run_qemu.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -x
+set -e
+
+USE_RVV="${USE_RVV:-OFF}"
+TOOLCHAIN_PATH="${TOOLCHAIN_PATH:-../../build-toolchain-qemu/riscv-clang}"
+QEMU_PREFIX_PATH="${QEMU_PREFIX_PATH:-../../build-toolchain-qemu/riscv-qemu/}"
+
+if [ "${USE_RVV}" = "ON" ];then
+ QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0 -L ${TOOLCHAIN_PATH}/sysroot"
+else
+ QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true -L ${TOOLCHAIN_PATH}/sysroot"
+fi
+
+"$QEMU_PREFIX_PATH"/bin/qemu-riscv64 $QEMU_OPTION "$@"
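Example run, assuming a test binary cross-compiled as above (the binary path is illustrative). Note the default TOOLCHAIN_PATH and QEMU_PREFIX_PATH are relative, so override them when running from a different working directory:

    USE_RVV=ON ./riscv_script/run_qemu.sh out/riscv/libyuv_unittest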
diff --git a/files/source/compare.cc b/source/compare.cc
index 5aa3a4db..50a736bd 100644
--- a/files/source/compare.cc
+++ b/source/compare.cc
@@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
}
#endif
- while (count >= (uint64_t)(kBlockSize)) {
+ while (count >= (uint64_t)kBlockSize) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
@@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
- if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
- if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
@@ -154,11 +154,6 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_MSA;
}
#endif
-#if defined(HAS_HAMMINGDISTANCE_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- HammingDistance = HammingDistance_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
@@ -216,11 +211,6 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_MSA;
}
#endif
-#if defined(HAS_SUMSQUAREERROR_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SumSquareError = SumSquareError_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
@@ -369,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a,
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
- if (ssim_d == 0.0) {
+ if (ssim_d == 0) {
return DBL_MAX;
}
- return ssim_n * 1.0 / ssim_d;
+ return (double)ssim_n / (double)ssim_d;
}
}
diff --git a/files/source/compare_common.cc b/source/compare_common.cc
index d4b170ad..d1cab8d2 100644
--- a/files/source/compare_common.cc
+++ b/source/compare_common.cc
@@ -17,36 +17,6 @@ namespace libyuv {
extern "C" {
#endif
-#if ORIGINAL_OPT
-uint32_t HammingDistance_C1(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- int i;
- for (i = 0; i < count; ++i) {
- int x = src_a[i] ^ src_b[i];
- if (x & 1)
- ++diff;
- if (x & 2)
- ++diff;
- if (x & 4)
- ++diff;
- if (x & 8)
- ++diff;
- if (x & 16)
- ++diff;
- if (x & 32)
- ++diff;
- if (x & 64)
- ++diff;
- if (x & 128)
- ++diff;
- }
- return diff;
-}
-#endif
-
// Hakmem method for hamming distance.
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
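For reference, the Hakmem-style kernel kept here counts bits in each 32-bit word x = a XOR b with the classic SWAR identity (each nibble of u ends up holding the bit count of the matching nibble of x; the final mod 255 sums the per-byte totals). This is a sketch of the identity, not a quote of the implementation:

    u = x - ((x >> 1) & 0x77777777) - ((x >> 2) & 0x33333333) - ((x >> 3) & 0x11111111)
    popcount(x) = ((u + (u >> 4)) & 0x0F0F0F0F) mod 255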
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
new file mode 100644
index 00000000..33cbe25d
--- /dev/null
+++ b/source/compare_gcc.cc
@@ -0,0 +1,359 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint64_t diff = 0u;
+
+ asm volatile(
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
+
+ // Process 32 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ :
+ : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+ return (uint32_t)(diff);
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ // Process 16 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "+r"(diff) // %3
+ :
+ : "memory", "cc", "ecx", "edx");
+
+ return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+
+ return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "vbroadcastf128 %4,%%ymm2 \n"
+ "vbroadcastf128 %5,%%ymm3 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
+ "vzeroupper \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+ return diff;
+}
+#endif // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+ return sse;
+}
+
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+static const uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
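+
+// For reference (an illustrative sketch, not the project's scalar path):
+// djb2 is hash = hash * 33 + byte. Folding 16 bytes per iteration gives the
+// same result because
+//   hash' = hash * 33^16 + b0 * 33^15 + b1 * 33^14 + ... + b15 * 33^0,
+// which is exactly what kHash16x33 and the kHashMul tables above encode.
+static uint32_t HashDjb2_Sketch(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
+  int i;
+  for (i = 0; i < count; ++i) {
+    hash = hash * 33u + src[i];
+  }
+  return hash;
+}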
+
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+ uint32_t hash;
+ asm volatile(
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_msa.cc b/source/compare_msa.cc
index 0b807d37..0b807d37 100644
--- a/files/source/compare_msa.cc
+++ b/source/compare_msa.cc
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
new file mode 100644
index 00000000..afdd6012
--- /dev/null
+++ b/source/compare_neon.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+
+ asm volatile(
+ "vmov.u16 q4, #0 \n" // accumulator
+
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
+
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
+
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
new file mode 100644
index 00000000..70fb9b91
--- /dev/null
+++ b/source/compare_neon64.cc
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+ asm volatile(
+ "movi v4.8h, #0 \n"
+
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
+
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_win.cc b/source/compare_win.cc
index d57d3d9d..9bb27f1d 100644
--- a/files/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -22,8 +22,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
@@ -77,8 +78,7 @@ __declspec(naked) uint32_t
}
}
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_SUMSQUAREERROR_AVX2
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
@@ -118,7 +118,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_SUMSQUAREERROR_AVX2
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
@@ -196,7 +196,7 @@ __declspec(naked) uint32_t
}
// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_HASHDJB2_AVX2
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
@@ -231,7 +231,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_HASHDJB2_AVX2
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
diff --git a/source/convert.cc b/source/convert.cc
new file mode 100644
index 00000000..6ac5bc43
--- /dev/null
+++ b/source/convert.cc
@@ -0,0 +1,4055 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/scale_row.h" // For FixedDiv
+#include "libyuv/scale_uv.h" // For UVScale()
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsample amount uses a shift.
+// v is value
+// a is amount to add to round up
+// s is shift to subsample down
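+// e.g. SUBSAMPLE(5, 1, 1) == 3 and SUBSAMPLE(-5, 1, 1) == -3: half size,
+// rounded away from zero.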
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? -((-(v) + (a)) >> (s)) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int src_uv_width,
+ int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ int r;
+ if (src_uv_width <= 0 || src_uv_height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+// Copy I420 with optional flipping.
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
+// it does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
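+
+// Usage note (illustrative; variable names are hypothetical): passing a
+// negative height flips the copy vertically, e.g.
+//   I420Copy(src_y, w, src_u, (w + 1) / 2, src_v, (w + 1) / 2,
+//            dst_y, w, dst_u, (w + 1) / 2, dst_v, (w + 1) / 2, w, -h);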
+
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+static int Planar16bitTo8bit(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ int scale = 1 << (24 - depth);
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ uv_height = -uv_height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (uv_height - 1) * src_stride_u;
+ src_v = src_v + (uv_height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ // Convert UV planes.
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width,
+ uv_height);
+ Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width,
+ uv_height);
+ return 0;
+}
+
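+// Any I41x (4:4:4 chroma at 10 or 12 bits) to I420: the Y plane is narrowed
+// to 8 bits, and each full-resolution chroma plane is halved in both
+// dimensions and narrowed in a single ScalePlaneDown2_16To8 pass.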
+static int I41xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, scale, kFilterBilinear);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
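+// Any I21x (4:2:2 chroma at 10 or 12 bits) to I420: chroma is already half
+// width, so only a vertical 2:1 scale (ScalePlaneVertical_16To8) plus the
+// narrowing to 8 bits is needed.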
+static int I21xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+ const int dy = FixedDiv(height, uv_height);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 10);
+}
+
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
+}
+
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 10);
+}
+
+LIBYUV_API
+int I410ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
+}
+
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 10);
+}
+
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 12);
+}
+
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 12);
+}
+
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 12);
+}
+
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
+// Any Ix10 To I010 format with mirroring.
+static int Ix10ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y) {
+ const int dst_y_width = Abs(width);
+ const int dst_y_height = Abs(height);
+ const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 0, 0);
+}
+
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 1, 0);
+}
+
+// Any I[420]1[02] to P[420]1[02] format with mirroring.
+static int IxxxToPxxx(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ depth);
+ MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, uv_width, uv_height, depth);
+ return 0;
+}
+
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 10);
+}
+
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 10);
+}
+
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 12);
+}
+
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 12);
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, src_uv_width, height);
+}
+
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
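+ // The scale of 1024 (1 << 10) maps 8-bit samples into the 10-bit range.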
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ height);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ height);
+ return 0;
+}
+
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+ if (!plane_u)
+ return 1;
+
+ I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0) {
+ return -1;
+ }
+
+ int sign = height < 0 ? -1 : 1;
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1,
+ (height + sign) / 2, 16);
+
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int sign = height < 0 ? -1 : 1;
+
+ if (!src_uv || !dst_u || !dst_v || width <= 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16);
+
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ if (!src_y || !src_uv || !dst_yuy2 || width <= 0) {
+ return -1;
+ }
+
+ DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2,
+ dst_stride_yuy2, width, height, 32);
+
+ return 0;
+}
+
+// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format
+// documentation.
+// TODO(greenjustin): Add an MT2T to I420 conversion.
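+// A row of Y tiles holds padded_width * 32 samples packed at 10 bits each,
+// hence y_tile_row_size = padded_width * 32 * 10 / 8 bytes below; UV tiles
+// are 16 rows tall, so uv_tile_row_size uses 16.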
+LIBYUV_API
+int MT2TToP010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width <= 0 || !height || !src_uv || !dst_uv) {
+ return -1;
+ }
+
+ {
+ int uv_width = (width + 1) & ~1;
+ int uv_height = (height + 1) / 2;
+ int y = 0;
+ const int tile_width = 16;
+ const int y_tile_height = 32;
+ const int uv_tile_height = 16;
+ int padded_width = (width + tile_width - 1) & ~(tile_width - 1);
+ int y_tile_row_size = padded_width * y_tile_height * 10 / 8;
+ int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8;
+ size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t);
+ void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) =
+ UnpackMT2T_C;
+ align_buffer_64(row_buf, row_buf_size);
+ if (!row_buf)
+ return 1;
+
+#if defined(HAS_UNPACKMT2T_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UnpackMT2T = UnpackMT2T_NEON;
+ }
+#endif
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ uv_height = (height + 1) / 2;
+ if (dst_y) {
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+
+ // Unpack and detile Y in rows of tiles
+ if (src_y && dst_y) {
+ for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) {
+ UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y,
+ width, y_tile_height, y_tile_height);
+ src_y += src_stride_y * y_tile_height;
+ dst_y += dst_stride_y * y_tile_height;
+ }
+ if (height & (y_tile_height - 1)) {
+ UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y,
+ width, height & (y_tile_height - 1), y_tile_height);
+ }
+ }
+
+ // Unpack and detile UV plane
+ for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) {
+ UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv,
+ uv_width, uv_tile_height, uv_tile_height);
+ src_uv += src_stride_uv * uv_tile_height;
+ dst_uv += dst_stride_uv * uv_tile_height;
+ }
+ if (uv_height & (uv_tile_height - 1)) {
+ UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv,
+ uv_width, uv_height & (uv_tile_height - 1),
+ uv_tile_height);
+ }
+ free_aligned_buffer_64(row_buf);
+ }
+ return 0;
+}
+
+#ifdef I422TONV21_ROW_VERSION
+// Unittest fails for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+ }
+ {
+ // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2;
+ align_buffer_64(row_vu_0, awidth * 2);
+ uint8_t* row_vu_1 = row_vu_0 + awidth;
+ if (!row_vu_0)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+ MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+ MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+ halfwidth);
+ InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+ }
+ free_aligned_buffer_64(row_vu_0);
+ }
+ return 0;
+}
+#endif // I422TONV21_ROW_VERSION
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, width, height);
+}
+
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
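+ // HalfMergeUVPlane 2x2-subsamples the full-resolution U and V planes and
+ // interleaves the result directly into the destination UV plane.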
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+ return 0;
+}
+
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows: when the strides equal the row width, the plane is
+ // contiguous and can be processed as one long row.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce chroma rows likewise.
+ if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Split UV plane - NV12 / NV21
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+
+ return 0;
+}
+
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ width, height);
+}
+
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Any P[420]1[02] to I[420]1[02] format with mirroring.
+static int PxxxToIxxx(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ depth);
+ SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, uv_width, uv_height, depth);
+ return 0;
+}
+
+LIBYUV_API
+int P010ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height, 1, 1, 10);
+}
+
+LIBYUV_API
+int P012ToI012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height, 1, 1, 12);
+}
+
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Convert YUY2 to I420.
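+// YUY2 packs two pixels into four bytes (Y0 U Y1 V), so chroma is shared
+// horizontally. YUY2ToUVRow also averages two source rows (note the stride
+// argument below; a final odd row passes 0 to reuse a single row).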
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+ YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_LSX;
+ YUY2ToUVRow = YUY2ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+ UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUVRow = UYVYToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUVRow = UYVYToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ UYVYToUVRow = UYVYToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_LSX;
+ UYVYToUVRow = UYVYToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUVRow = UYVYToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUVRow = UYVYToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToUVRow = AYUVToUVRow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToUVRow = AYUVToUVRow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToUVRow = AYUVToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToUVRow = AYUVToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV21.
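+// Identical to AYUVToNV12 except the interleaved chroma plane is written
+// V-first (VU order) via the AYUVToVURow kernels.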
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_vu, int width) = AYUVToVURow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ if (!src_ayuv || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToVURow = AYUVToVURow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToVURow = AYUVToVURow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToVURow = AYUVToVURow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToVURow = AYUVToVURow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToVURow = AYUVToVURow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToVURow = AYUVToVURow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ AYUVToVURow(src_ayuv, 0, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
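+// ARGBToI420Alpha has two variants selected by USE_EXTRACTALPHA: a two-pass
+// version (I420 conversion followed by a full-image alpha extraction) and a
+// fused single-pass loop that fills Y, U, V and A together.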
+#ifdef USE_EXTRACTALPHA
+// Convert ARGB to I420 with Alpha
+// The following version calls ARGBExtractAlpha on the full image.
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height);
+ if (r == 0) {
+ r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width,
+ height);
+ }
+ return r;
+}
+#else // USE_EXTRACTALPHA
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+ int width) = ARGBExtractAlphaRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+ : ARGBExtractAlphaRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+ : ARGBExtractAlphaRow_Any_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+ : ARGBExtractAlphaRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+ : ARGBExtractAlphaRow_Any_MSA;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX
+ : ARGBExtractAlphaRow_Any_LSX;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+ }
+#endif
+
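+ // Y and A are full resolution, so their pointers advance two strides per
+ // iteration; U and V are subsampled 2x2 and advance one stride.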
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a,
+ width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ dst_a += dst_stride_a * 2;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ }
+ return 0;
+}
+#endif // USE_EXTRACTALPHA
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
+ BGRAToYRow_C;
+ if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BGRAToYRow = BGRAToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToYRow = BGRAToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BGRAToUVRow = BGRAToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToUVRow = BGRAToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
+ BGRAToUVRow = BGRAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToUVRow = BGRAToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ BGRAToYRow = BGRAToYRow_Any_LSX;
+ BGRAToUVRow = BGRAToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_LSX;
+ BGRAToUVRow = BGRAToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ BGRAToYRow = BGRAToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToYRow = BGRAToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_RVV)
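+ // This RVV path provides only the Y kernel; the UV rows stay on the C path.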
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BGRAToYRow = BGRAToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ ABGRToUVRow = ABGRToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
+ ABGRToUVRow = ABGRToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
+ RGBAToYRow_C;
+ if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
+ RGBAToUVRow = RGBAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
+ RGBAToUVRow = RGBAToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGBAToYRow = RGBAToYRow_Any_LSX;
+ RGBAToUVRow = RGBAToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_LSX;
+ RGBAToUVRow = RGBAToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGBAToYRow = RGBAToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYRow = RGBAToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToYRow = RGBAToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Enabled if 1 pass is available
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV))
+#define HAS_RGB24TOYROW
+#endif
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RGB24TOYROW)
+ void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYROW)
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+ RGB24ToYRow = RGB24ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_MSA;
+ RGB24ToUVRow = RGB24ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LSX;
+ RGB24ToYRow = RGB24ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_LSX;
+ RGB24ToUVRow = RGB24ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LASX;
+ RGB24ToYRow = RGB24ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYRow = RGB24ToYRow_LASX;
+ RGB24ToUVRow = RGB24ToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYRow = RGB24ToYRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else // HAS_RGB24TOYROW
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RGB24TOYROW
+
+ {
+#if !defined(HAS_RGB24TOYROW)
+ // Allocate 2 rows of ARGB.
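+ // Each row is rounded up to a multiple of 32 bytes so both scratch rows
+ // stay SIMD-aligned.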
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
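+ // One-pass platforms convert RGB24 rows directly; the fallback expands each
+ // pair of rows into the ARGB scratch buffer and runs the ARGB kernels on it.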
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RGB24TOYROW
+
+// Enabled if 1 pass is available
+#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_RVV)
+#define HAS_RGB24TOYJROW
+#endif
+
+// Convert RGB24 to J420.
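+// J420 is full-range (JPEG) I420; the YJ/UVJ row kernels apply full-range
+// BT.601 coefficients.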
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RGB24TOYJROW)
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYJROW)
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYJRow = RGB24ToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else // HAS_RGB24TOYJROW
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RGB24TOYJROW
+
+ {
+#if !defined(HAS_RGB24TOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYJROW)
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width);
+ ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYJROW)
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RGB24TOYJROW
+
+// Enabled if 1 pass is available
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV))
+#define HAS_RAWTOYROW
+#endif
+
+// Convert RAW to I420.
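+// RAW is RGB24 with the R and B bytes swapped; the structure mirrors
+// RGB24ToI420.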
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RAWTOYROW)
+ void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYROW)
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_NEON;
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVRow = RAWToUVRow_Any_MSA;
+ RAWToYRow = RAWToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_MSA;
+ RAWToUVRow = RAWToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToUVRow = RAWToUVRow_Any_LSX;
+ RAWToYRow = RAWToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_LSX;
+ RAWToUVRow = RAWToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToUVRow = RAWToUVRow_Any_LASX;
+ RAWToYRow = RAWToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYRow = RAWToYRow_LASX;
+ RAWToUVRow = RAWToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYRow = RAWToYRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYROW
+
+ {
+#if !defined(HAS_RAWTOYROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RAWTOYROW
+
+// Enabled if 1 pass is available
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \
+ defined(HAS_RAWTOYJROW_RVV)
+#define HAS_RAWTOYJROW
+#endif
+
+// Convert RAW to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+
+ {
+#if !defined(HAS_RAWTOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
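+ // The NEON Y kernel handles any multiple of 8 pixels; the UV kernel needs
+ // a multiple of 16.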
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+// MSA, LSX and LASX versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \
+ defined(HAS_RGB565TOYROW_LASX))
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_LSX) && defined(HAS_RGB565TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LSX;
+ RGB565ToYRow = RGB565ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_LSX;
+ RGB565ToUVRow = RGB565ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LASX;
+ RGB565ToYRow = RGB565ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToYRow = RGB565ToYRow_LASX;
+ RGB565ToUVRow = RGB565ToUVRow_LASX;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+ int width) = ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+// MSA, LSX and LASX versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \
+ defined(HAS_ARGB1555TOYROW_LASX))
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LSX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LASX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LASX;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size,
+ width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+ int width) = ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size,
+ width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RGB24 to J400.
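+// J400 is a single full-range luma plane; no chroma is produced.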
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYJRow = RGB24ToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
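+
+// Hedged usage sketch (buffers and sizes are illustrative; J400 means
+// full-range JPEG luma):
+//   uint8_t rgb[64 * 48 * 3];  // packed 24 bpp input
+//   uint8_t gray[64 * 48];     // grayscale output
+//   RGB24ToJ400(rgb, 64 * 3, gray, 64, 64, 48);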
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+
+#if defined(HAS_RAWTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_pixel_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height, kRotate0);
+}
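+
+// Note on src_pixel_stride_uv, based on the Android YUV_420_888 layout this
+// API mirrors: 1 means fully planar chroma (I420-style), 2 means the U and V
+// samples are interleaved (NV12/NV21-style), with the u/v pointers one byte
+// apart selecting the plane order. Hedged sketch for an interleaved source:
+//   // vu holds V,U,V,U,...; passing vu and vu + 1 reads V then U.
+//   Android420ToI420(y, y_stride, vu + 1, uv_stride, vu, uv_stride,
+//                    /*src_pixel_stride_uv=*/2, dst_y, w, dst_u, (w + 1) / 2,
+//                    dst_v, (w + 1) / 2, w, h);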
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
new file mode 100644
index 00000000..871fea59
--- /dev/null
+++ b/source/convert_argb.cc
@@ -0,0 +1,8556 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include <assert.h>
+
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // For ScaleRowUp2_Linear and ScaleRowUp2_Bilinear
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+ height);
+ return 0;
+}
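+
+// As with most functions in this file, a negative height requests a vertical
+// flip. Hedged example: ARGBCopy(src, stride, dst, stride, width, -height)
+// copies the image inverted.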
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
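+
+// Note: the ABGR wrappers in this file all reuse the ARGB path. Swapping the
+// U and V arguments and selecting the VU-swapped ("Yvu") constants makes the
+// ARGB row kernels emit bytes in ABGR order, so no dedicated ABGR kernels
+// are needed.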
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
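+
+// The row-coalescing test used throughout this file is a simple fast path:
+// when every stride equals the packed row width, the image is contiguous in
+// memory, so it can be processed as a single row of width * height pixels
+// (e.g. a 64x48 plane with stride 64 becomes one 3072-pixel row).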
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to RGB24 with matrix.
+LIBYUV_API
+int I444ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to RGB24.
+LIBYUV_API
+int I444ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to RAW.
+LIBYUV_API
+int I444ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I444ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
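+
+// RAW is the byte-reversed sibling of RGB24 (R,G,B versus B,G,R in memory),
+// so the same swap trick used for the ABGR wrappers applies: passing V as U
+// with the Yvu constants makes the RGB24 writer emit RAW.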
+
+// Convert 10 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
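+
+// AR30 layout note (per libyuv's format documentation): each pixel is a
+// 32-bit little-endian word with 10-bit B in the low bits, then 10-bit G,
+// 10-bit R, and a 2-bit alpha in the top bits.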
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert 12 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to
+// multiply 12 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I212TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToAR30Row = I212ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToAR30Row = I212ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToAR30Row = I212ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToAR30Row = I212ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
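+
+// Unlike the 4:2:0 I010 path above, I210 is 4:2:2, so the chroma pointers
+// advance on every row rather than every other row.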
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
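+// Convert 10 bit 444 YUV to AR30 with matrix.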
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I212TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToARGBRow = I212ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToARGBRow = I212ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToARGBRow = I212ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToARGBRow = I212ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
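+// Convert 10 bit 444 YUV to ARGB with matrix.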
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
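+// Convert P010 to ARGB with matrix.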
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
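+
+// P010 note: each sample is 16 bits with the 10 significant bits MSB-aligned
+// (low 6 bits zero), and chroma is an interleaved UV plane at 4:2:0, which
+// is why src_uv only advances on odd rows above.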
+
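+// Convert P210 to ARGB with matrix.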
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
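+// Convert P010 to AR30 with matrix.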
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
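+// Convert P210 to AR30 with matrix.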
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
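+// A minimal usage sketch (editor illustration, not upstream libyuv code):
+// convert a tightly packed I420A frame to premultiplied ARGB with the BT.601
+// matrix. The guard macro and function name are hypothetical; the strides
+// assume contiguous planes and an even width.
+#ifdef LIBYUV_DOC_EXAMPLES
+static int ExampleI420AToPremultipliedARGB(const uint8_t* y,
+                                           const uint8_t* u,
+                                           const uint8_t* v,
+                                           const uint8_t* a,
+                                           uint8_t* argb,
+                                           int width,
+                                           int height) {
+  // Y and A are full resolution; U and V are quarter resolution (half width,
+  // half height), hence the width / 2 strides. attenuate=1 premultiplies.
+  return I420AlphaToARGBMatrix(y, width, u, width / 2, v, width / 2, a, width,
+                               argb, width * 4, &kYuvI601Constants, width,
+                               height, /*attenuate=*/1);
+}
+#endif
+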
+// Convert I422 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I422 with Alpha to ARGB.
+LIBYUV_API
+int I422AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I422 with Alpha to ABGR.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I444 with Alpha to ARGB.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I444 with Alpha to ABGR.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I010 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I210 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I410 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB with matrix.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I400ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
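+  // Contiguous rows can be processed as one long row, cutting per-row loop
+  // overhead.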
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I400ToARGBRow = I400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I400ToARGBRow = I400ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB. Y is treated as limited-range luma via the BT.601
+// matrix.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J400 (full-range gray, no YUV matrix) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
+ J400ToARGBRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ J400ToARGBRow = J400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ J400ToARGBRow = J400ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifndef __riscv
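+// Byte i of each mask below holds the index of the source byte copied to
+// output byte i (PSHUFB-style), as consumed by ARGBShuffle/AR64Shuffle.
+// Reversing a 4-byte pixel and swapping two channels are involutions, which
+// is why the reverse conversions reuse the same tables.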
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+
+// Shuffle table for converting AR64 to AB64.
+static const uvec8 kShuffleMaskAR64ToAB64 = {
+ 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height);
+}
+
+// Convert AR64 to AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64,
+ (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height);
+}
+#else
+// Convert BGRA to ARGB (same as ARGBToBGRA).
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBToBGRA(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, width,
+ height);
+}
+
+// Convert ARGB to BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToBGRARow)(const uint8_t* src_argb, uint8_t* dst_bgra, int width) =
+ ARGBToBGRARow_C;
+ if (!src_argb || !dst_bgra || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_bgra == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_bgra = 0;
+ }
+
+#if defined(HAS_ARGBTOBGRAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToBGRARow = ARGBToBGRARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToBGRARow(src_argb, dst_bgra, width);
+ src_argb += src_stride_argb;
+ dst_bgra += dst_stride_bgra;
+ }
+ return 0;
+}
+
+// Convert ARGB to ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToABGRRow)(const uint8_t* src_argb, uint8_t* dst_abgr, int width) =
+ ARGBToABGRRow_C;
+ if (!src_argb || !dst_abgr || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_abgr = 0;
+ }
+
+#if defined(HAS_ARGBTOABGRROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToABGRRow = ARGBToABGRRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToABGRRow(src_argb, dst_abgr, width);
+ src_argb += src_stride_argb;
+ dst_abgr += dst_stride_abgr;
+ }
+ return 0;
+}
+
+// Convert ABGR to ARGB (same as ARGBToABGR).
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBToABGR(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, width,
+ height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToARGBRow)(const uint8_t* src_rgba, uint8_t* dst_argb, int width) =
+ RGBAToARGBRow_C;
+ if (!src_rgba || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_rgba == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_argb = 0;
+ }
+
+#if defined(HAS_RGBATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToARGBRow = RGBAToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToARGBRow(src_rgba, dst_argb, width);
+ src_rgba += src_stride_rgba;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR64 to AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ToAB64Row)(const uint16_t* src_ar64, uint16_t* dst_ab64,
+ int width) = AR64ToAB64Row_C;
+ if (!src_ar64 || !dst_ab64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_ab64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_ab64 = 0;
+ }
+
+#if defined(HAS_AR64TOAB64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AR64ToAB64Row = AR64ToAB64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ToAB64Row(src_ar64, dst_ab64, width);
+ src_ar64 += src_stride_ar64;
+ dst_ab64 += dst_stride_ab64;
+ }
+ return 0;
+}
+#endif
+
+// Convert RGB24 to ARGB.
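+// (In libyuv, RGB24 stores pixels as B,G,R bytes in memory; RAW, below,
+// stores R,G,B.)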
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToARGBRow = RAWToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToARGBRow = RAWToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+ RAWToRGBARow_C;
+ if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgba = 0;
+ }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGBARow = RAWToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGBARow = RAWToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGBARow = RAWToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToRGBARow = RAWToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGBARow(src_raw, dst_rgba, width);
+ src_raw += src_stride_raw;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
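+// 5- and 6-bit channels are widened to 8 bits by bit replication, e.g.
+// (b << 3) | (b >> 2).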
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR30 to ARGB.
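+// AR30 packs one pixel per 32-bit little-endian word: 10-bit B, G, R from
+// low to high bits, with 2-bit A on top (mirroring ARGB byte order).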
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_argb = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToARGBRow_C(src_ar30, dst_argb, width);
+ src_ar30 += src_stride_ar30;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_abgr = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+ src_ar30 += src_stride_ar30;
+ dst_abgr += dst_stride_abgr;
+ }
+ return 0;
+}
+
+// Convert AR30 to AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_ab30 = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+ src_ar30 += src_stride_ar30;
+ dst_ab30 += dst_stride_ab30;
+ }
+ return 0;
+}
+
+// Convert AR64 to ARGB.
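+// 16-bit channels are narrowed to 8 bits by taking the high byte of each
+// component.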
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
+ int width) = AR64ToARGBRow_C;
+ if (!src_ar64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AR64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AR64ToARGBRow = AR64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AR64ToARGBRow = AR64ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ToARGBRow(src_ar64, dst_argb, width);
+ src_ar64 += src_stride_ar64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AB64 to ARGB.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+ int src_stride_ab64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
+ int width) = AB64ToARGBRow_C;
+ if (!src_ab64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ab64 = src_ab64 + (height - 1) * src_stride_ab64;
+ src_stride_ab64 = -src_stride_ab64;
+ }
+ // Coalesce rows.
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ab64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AB64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AB64ToARGBRow = AB64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AB64ToARGBRow = AB64ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AB64ToARGBRow(src_ab64, dst_argb, width);
+ src_ab64 += src_stride_ab64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV12ToARGBRow = NV12ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV21ToARGBRow = NV21ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
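+// A minimal usage sketch (editor illustration, not upstream libyuv code):
+// decode a tightly packed NV12 frame into ARGB. For packed NV12 with even
+// width, the interleaved UV plane has the same byte stride as the Y plane.
+// The guard macro and function name are hypothetical.
+#ifdef LIBYUV_DOC_EXAMPLES
+static int ExampleNV12ToARGB(const uint8_t* nv12_y,
+                             const uint8_t* nv12_uv,
+                             uint8_t* argb,
+                             int width,
+                             int height) {
+  return NV12ToARGB(nv12_y, width, nv12_uv, width, argb, width * 4, width,
+                    height);
+}
+#endif
+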
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB, swap the U and V planes and use a mirrored
+// YUV matrix; routing NV12 data through the NV21 (VU-ordered) path performs
+// that swap.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV12 to RAW.
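+// RAW is RGB24 with the byte order reversed, so this reuses
+// NV21ToRGB24Matrix with the VU-swapped kYvuI601Constants.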
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+ uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+ if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+ dst_stride_yuv24 = -dst_stride_yuv24;
+ }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+ dst_yuv24 += dst_stride_yuv24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ YUY2ToARGBRow_C;
+ if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
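+ // (With strides zeroed, the loop below walks the whole contiguous image as
+ // a single long row.)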
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_LSX;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ UYVYToARGBRow_C;
+ if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_LSX;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
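+
+// Interleave one row of U and one row of V, each read with the given pixel
+// stride, into a packed UV row as used by NV12.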
+static void WeavePixels(const uint8_t* src_u,
+ const uint8_t* src_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_uv,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_uv[0] = *src_u;
+ dst_uv[1] = *src_v;
+ dst_uv += 2;
+ src_u += src_pixel_stride_uv;
+ src_v += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ uint8_t* dst_uv;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
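+ // Android flexible YUV (e.g. YUV_420_888) exposes U and V as separate
+ // planes with a pixel stride. A pixel stride of 1 is planar I420; a pixel
+ // stride of 2 with U and V adjacent is interleaved NV21 or NV12, which
+ // dispatch to the faster biplanar converters.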
+ // I420
+ if (src_pixel_stride_uv == 1) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ }
+ // NV21
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ }
+ // NV12
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ }
+
+ // General case fallback: weave U and V into a temporary NV12 plane, then
+ // convert that.
+ align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+ if (!plane_uv)
+ return 1;
+ dst_uv = plane_uv;
+ for (y = 0; y < halfheight; ++y) {
+ WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += halfwidth * 2;
+ }
+ NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ free_aligned_buffer_64(plane_uv);
+ return 0;
+}
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_pixel_stride_uv, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height);
+}
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, src_pixel_stride_uv, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width,
+ height);
+}
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGBARow = I422ToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGBARow = I422ToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGB24Row = I422ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to RGB24 with matrix.
+LIBYUV_API
+int I422ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGB24Row = I422ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGB24.
+LIBYUV_API
+int I422ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to RAW.
+LIBYUV_API
+int I422ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565 with specified color matrix.
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
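+// Each group of four values is one row of the dither matrix; the conversion
+// loop loads row (y & 3) as a uint32_t and applies it before truncating
+// 888 to 565.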
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
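+ // This kernel needs both AVX512BW and AVX512VL, so test the combined mask.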
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ if (!row_argb)
+ return 1;
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I420 to AB30.
+LIBYUV_API
+int I420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H420 to AB30.
+LIBYUV_API
+int H420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
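+// Convert I420 to ARGB with matrix, upsampling chroma 2x in both directions
+// with a bilinear filter and converting each row as I444.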
+static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToARGBRow = I444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows.
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ if (!row)
+ return 1;
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+
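+ // The first output row has no chroma row above it, so upsample it
+ // horizontally only.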
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
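+ // Scale2RowUp_Bilinear writes two upsampled chroma rows per call (temp_*_1
+ // and temp_*_2, row_size apart), so each iteration emits two ARGB rows.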
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ I444ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
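+ // For even heights the last output row has no chroma row below it, so it
+ // is also upsampled horizontally only.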
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
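+// Convert I422 to ARGB with matrix, upsampling chroma 2x horizontally with a
+// linear filter and converting each row as I444.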
+static int I422ToARGBMatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToARGBRow = I444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows.
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
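+// The *MatrixBilinear helpers upsample I420 chroma 2x in both directions.
+// Each loop iteration hands one source chroma row to Scale2RowUp_Bilinear,
+// which writes two interpolated rows (temp_*_1 and temp_*_2, row_size
+// apart), so two output rows are converted per iteration. The first row,
+// and the last row when height is even, have no 2x2 neighborhood and use
+// the horizontal-only linear kernel instead.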
+static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
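+// 10-bit variant of the bilinear scheme: chroma rows are upsampled with the
+// ScaleRowUp2_*_12 kernels (12 bits of headroom covers 10-bit samples) and
+// converted with the full-resolution I410ToAR30Row kernel.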
+static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ I410ToAR30Row(src_y, temp_u_2, temp_v_2, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+
+ return 0;
+}
+
+static int I210ToAR30MatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear_12(src_u, temp_u, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v, width);
+ I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ I410ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I210ToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear_12(src_u, temp_u, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v, width);
+ I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
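+// Alpha variants reuse the same upsampling scheme. The alpha plane is
+// already full resolution, so src_a feeds the row kernel directly and
+// advances on every output row; when attenuate is set, each finished row is
+// premultiplied in place with ARGBAttenuateRow.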
+static int I420AlphaToARGBMatrixBilinear(
+ const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ I444AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ src_a += src_stride_a;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
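+// 10-bit alpha variant: the 16-bit alpha plane (10-bit samples) is consumed
+// by I410AlphaToARGBRow directly, while attenuation still operates on the
+// 8-bit ARGB output row.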
+static int I010AlphaToARGBMatrixBilinear(
+ const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ I410AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
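+// P010/P210 carry interleaved (biplanar) UV, so each temp row holds
+// 2 * width samples and the ScaleUVRowUp2_* kernels interpolate U/V pairs
+// together before P410ToARGBRow consumes the packed row.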
+static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear_16)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToARGBRow = P410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToARGBRow = P410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary interleaved UV rows.
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_uv_1 = (uint16_t*)(row);
+ uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ P410ToARGBRow(src_y, temp_uv_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ if (!(height & 1)) {
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int P210ToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
+ int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToARGBRow = P410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToARGBRow = P410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * sizeof(uint16_t));
+ uint16_t* temp_uv = (uint16_t*)(row);
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_uv, temp_uv, width);
+ P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
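+// Same biplanar scheme as P010ToARGBMatrixBilinear, emitting 2:10:10:10
+// AR30 pixels via P410ToAR30Row.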
+static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+ void (*Scale2RowUp_Bilinear_16)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToAR30Row = P410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToAR30Row = P410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary interleaved UV rows.
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_uv_1 = (uint16_t*)(row);
+ uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ P410ToAR30Row(src_y, temp_uv_2, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ if (!(height & 1)) {
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int P210ToAR30MatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
+ int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToAR30Row = P410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToAR30Row = P410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * sizeof(uint16_t));
+ uint16_t* temp_uv = (uint16_t*)(row);
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_uv, temp_uv, width);
+ P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
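+// Row-by-row linear scheme as in I422ToARGBMatrixLinear, emitting RGB24;
+// this is the worker behind I422ToRGB24MatrixFilter below.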
+static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422ToRGB24MatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_rgb24, dst_stride_rgb24, yuvconstants, width, height);
+ }
+
+ return -1;
+}
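+
+// A minimal caller sketch for the filtered API (illustration only, not part
+// of this change). The buffer names and the choice of kYuvI601Constants are
+// assumptions; any YuvConstants matrix may be passed.
+//
+//   // Convert one I422 frame to RGB24 with linear chroma upsampling.
+//   int r = I422ToRGB24MatrixFilter(y, y_stride, u, u_stride, v, v_stride,
+//                                   rgb24, rgb24_stride, &kYuvI601Constants,
+//                                   width, height, kFilterLinear);
+//   // r is 0 on success, -1 on bad arguments, 1 if the temporary rows
+//   // could not be allocated.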
+
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420ToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ case kFilterLinear:
+ // Linear filtering is possible here, but no caller currently needs it.
+ return -1;
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422ToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I420ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420ToRGB24MatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_rgb24, dst_stride_rgb24, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010ToAR30MatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210ToAR30MatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010ToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210ToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420AlphaToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422AlphaToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010AlphaToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210AlphaToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+// TODO(fb): Verify this function works correctly. P010 is like NV12 but
+// 10 bit, with an interleaved biplanar UV plane.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants, width,
+ height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv,
+ src_stride_uv, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P210ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants, width,
+ height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return P210ToARGBMatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants, width,
+ height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv,
+ src_stride_uv, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P210ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants, width,
+ height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return P210ToAR30MatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants,
+ width, height);
+ }
+
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/convert_from.cc b/source/convert_from.cc
new file mode 100644
index 00000000..e69da9e9
--- /dev/null
+++ b/source/convert_from.cc
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
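+// Subsample a dimension: rounds the magnitude up (a == (1 << s) - 1) and is
+// symmetric for negative (mirrored) sizes, e.g. SUBSAMPLE(5, 1, 1) == 3 and
+// SUBSAMPLE(-5, 1, 1) == -3.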
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 to any I4xx YUV format with mirroring.
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
+
+static int I420ToI4xx(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int dst_uv_width,
+ int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ int r;
+ if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+ dst_uv_height <= 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
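+ // Start at the last row and step upward via negated strides.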
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
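+ // Scale 1024 applies a 4x gain, so 8-bit 255 maps to 1020 near the top of
+ // the 10-bit range.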
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// Convert 8 bit YUV to 12 bit.
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
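+ // Scale 4096 applies a 16x gain, so 8-bit 255 maps to 4080 in the 12-bit
+ // range.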
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
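+// Converting 420 to 422 keeps the halved chroma width and doubles the number
+// of chroma rows.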
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
+ Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+// 422 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12, or NV21.
+LIBYUV_API
+int I400Copy(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
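+ // When every plane is stored contiguously the image can be processed as a
+ // single long row with zero strides.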
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
+ }
+ }
+#endif
+
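+ // With 420 sampling each U/V row covers two Y rows, so two YUY2 rows are
+ // written per pass before the chroma pointers advance.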
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
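+ // Interleave the half-resolution U and V planes into the biplanar UV plane.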
+ MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+ halfwidth, halfheight);
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
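+ // Swapping the U and V arguments makes I420ToNV12 write the merged chroma
+ // plane in VU order, which is NV21.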
+ return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to the format specified by fourcc.
+LIBYUV_API
+int ConvertFromI420(const uint8_t* y,
+ int y_stride,
+ const uint8_t* u,
+ int u_stride,
+ const uint8_t* v,
+ int v_stride,
+ uint8_t* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
+ int r = 0;
+ if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
+ return -1;
+ }
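+ // A dst_sample_stride of 0 selects the minimal packed stride for the
+ // requested format.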
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_AR30:
+ r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ case FOURCC_NV12: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_uv = dst_sample + dst_y_stride * height;
+ r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_uv,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ case FOURCC_NV21: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_vu = dst_sample + dst_y_stride * height;
+ r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_vu,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * halfheight;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * halfheight;
+ }
+ r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * height;
+ }
+ r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + dst_sample_stride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + dst_sample_stride * height;
+ }
+ r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+ dst_sample_stride, width, height);
+ break;
+ }
+ // Formats not supported: MJPG, other biplanar formats, and some RGB formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_from_argb.cc b/source/convert_from_argb.cc
index fbcd039d..b45de8c8 100644
--- a/files/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -76,11 +76,19 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOUV444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToUV444Row = ARGBToUV444Row_MMI;
+#if defined(HAS_ARGBTOUV444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV444Row = ARGBToUV444Row_LASX;
}
}
#endif
@@ -103,7 +111,7 @@ int ARGBToI444(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -116,14 +124,27 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
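+ // RVV row functions stripmine to any width; no alignment check is needed.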
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -170,30 +191,42 @@ int ARGBToI422(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -206,40 +239,51 @@ int ARGBToI422(const uint8_t* src_argb,
}
}
#endif
-
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
@@ -279,74 +323,89 @@ int ARGBToNV12(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -358,11 +417,19 @@ int ARGBToNV12(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -379,18 +446,25 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
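+ // Bail out if the aligned row allocation failed.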
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -439,30 +513,42 @@ int ARGBToNV21(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -475,39 +561,51 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
}
}
#endif
-
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -519,11 +617,19 @@ int ARGBToNV21(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -540,18 +646,25 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -599,30 +712,42 @@ int ABGRToNV12(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON;
}
}
@@ -635,38 +760,227 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
+ {
+ // Allocate a row of U and a row of V.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
}
#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
+#if defined(HAS_ABGRTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
}
}
#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
}
}
#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -678,11 +992,19 @@ int ABGRToNV12(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -699,31 +1021,38 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
+ dst_vu += dst_stride_vu;
}
if (height & 1) {
ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
}
free_aligned_buffer_64(row_u);
@@ -764,30 +1093,42 @@ int ARGBToYUY2(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -800,38 +1141,51 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -864,11 +1218,19 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
+#if defined(HAS_I422TOYUY2ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
}
}
#endif
@@ -878,6 +1240,8 @@ int ARGBToYUY2(const uint8_t* src_argb,
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8_t* row_u = row_y + ((width + 63) & ~63);
uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+ if (!row_y)
+ return 1;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -925,30 +1289,42 @@ int ARGBToUYVY(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -961,38 +1337,51 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -1025,11 +1414,19 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
}
}
#endif
@@ -1039,6 +1436,8 @@ int ARGBToUYVY(const uint8_t* src_argb,
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8_t* row_u = row_y + ((width + 63) & ~63);
uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+ if (!row_y)
+ return 1;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -1097,7 +1496,7 @@ int ARGBToI400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -1110,14 +1509,27 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -1127,6 +1539,7 @@ int ARGBToI400(const uint8_t* src_argb,
return 0;
}
+#ifndef __riscv
// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
@@ -1142,6 +1555,47 @@ int ARGBToRGBA(const uint8_t* src_argb,
return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
(const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
}
+#else
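+// RISC-V builds implement ARGBToRGBA with a dedicated row function (RVV
+// when available) instead of the byte-shuffle table above.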
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGBARow)(const uint8_t* src_argb, uint8_t* dst_rgba, int width) =
+ ARGBToRGBARow_C;
+ if (!src_argb || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgba = 0;
+ }
+
+#if defined(HAS_ARGBTORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRGBARow = ARGBToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGBARow(src_argb, dst_rgba, width);
+ src_argb += src_stride_argb;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+#endif
// Convert ARGB to RGB24.
LIBYUV_API
@@ -1195,7 +1649,7 @@ int ARGBToRGB24(const uint8_t* src_argb,
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
}
}
@@ -1208,14 +1662,27 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+#if defined(HAS_ARGBTORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LSX;
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -1282,14 +1749,27 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORAWROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRAWRow = ARGBToRAWRow_MMI;
+#if defined(HAS_ARGBTORAWROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRAWRow = ARGBToRAWRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -1315,7 +1795,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
+ uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -1360,11 +1840,19 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
}
}
#endif
@@ -1437,11 +1925,20 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+#if defined(HAS_ARGBTORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LSX;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LASX;
}
}
#endif
@@ -1511,11 +2008,19 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB1555ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+#if defined(HAS_ARGBTOARGB1555ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LASX;
}
}
#endif
@@ -1585,11 +2090,19 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB4444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+#if defined(HAS_ARGBTOARGB4444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LASX;
}
}
#endif
@@ -1706,19 +2219,19 @@ int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1727,16 +2240,38 @@ int ARGBToJ420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1745,66 +2280,63 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
for (y = 0; y < height - 1; y += 2) {
- ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
src_argb += src_stride_argb * 2;
dst_yj += dst_stride_yj * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
}
if (height & 1) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
}
return 0;
@@ -1816,19 +2348,19 @@ int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1839,21 +2371,27 @@ int ARGBToJ422(const uint8_t* src_argb,
}
// Coalesce rows.
if (src_stride_argb == width * 4 && dst_stride_yj == width &&
- dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+ dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) {
width *= height;
height = 1;
- src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1862,10 +2400,18 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1878,46 +2424,51 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
for (y = 0; y < height; ++y) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
src_argb += src_stride_argb;
dst_yj += dst_stride_yj;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
}
return 0;
}
@@ -1966,7 +2517,7 @@ int ARGBToJ400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1979,12 +2530,9 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
@@ -1996,6 +2544,798 @@ int ARGBToJ400(const uint8_t* src_argb,
return 0;
}
+// Convert RGBA to J400 (JPEG full-range grayscale).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
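+ // When the strides equal the row width in bytes the planes are contiguous,
+ // so the whole image can be processed as one long row.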
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGBAToYJRow = RGBAToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGBAToYJRow = RGBAToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToYJRow = RGBAToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert ABGR to J420 (JPEG full-range I420).
+LIBYUV_API
+int ABGRToJ420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ABGRToUVJRow_C;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
+ }
+ if (height & 1) {
+ ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to J422 (JPEG full-range I422).
+LIBYUV_API
+int ABGRToJ422(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ABGRToUVJRow_C;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width &&
+ dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ src_abgr += src_stride_abgr;
+ dst_yj += dst_stride_yj;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
+ }
+ return 0;
+}
+
+// Convert ABGR to J400 (JPEG full-range grayscale).
+LIBYUV_API
+int ABGRToJ400(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_yj = 0;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ src_abgr += src_stride_abgr;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert ARGB to AR64 (64-bit ARGB, 16 bits per channel).
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
+ int width) = ARGBToAR64Row_C;
+ if (!src_argb || !dst_ar64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_ARGBTOAR64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAR64Row = ARGBToAR64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToAR64Row = ARGBToAR64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAR64Row(src_argb, dst_ar64, width);
+ src_argb += src_stride_argb;
+ dst_ar64 += dst_stride_ar64;
+ }
+ return 0;
+}
+
+// Convert ARGB to AB64 (64-bit ABGR, 16 bits per channel).
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ab64,
+ int width) = ARGBToAB64Row_C;
+ if (!src_argb || !dst_ab64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ab64 = 0;
+ }
+#if defined(HAS_ARGBTOAB64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAB64Row = ARGBToAB64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToAB64Row = ARGBToAB64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAB64Row(src_argb, dst_ab64, width);
+ src_argb += src_stride_argb;
+ dst_ab64 += dst_stride_ab64;
+ }
+ return 0;
+}
+
+// Enabled if a one-pass (direct RAW to YJ) row kernel is available.
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \
+ defined(HAS_RAWTOYJROW_RVV)
+#define HAS_RAWTOYJROW
+#endif
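+// When defined, RAW rows convert straight to YJ/UVJ in one pass; otherwise
+// each row pair is expanded to ARGB first and then converted.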
+
+// Convert RAW to JNV21 (JPEG full-range NV21).
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// The NEON version converts RAW directly to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
+ {
+#if defined(HAS_RAWTOYJROW)
+ // Allocate one row each of U and V, rounded up to 32-byte multiples.
+ const int row_uv_size = ((halfwidth + 31) & ~31);
+ align_buffer_64(row_uj, row_uv_size * 2);
+ uint8_t* row_vj = row_uj + row_uv_size;
+#else
+ // Allocate one row each of U and V, plus two rows of ARGB scratch.
+ const int row_size = ((width * 4 + 31) & ~31);
+ const int row_uv_size = ((halfwidth + 31) & ~31);
+ align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2);
+ uint8_t* row_vj = row_uj + row_uv_size;
+ uint8_t* row = row_vj + row_uv_size;
+#endif
+ if (!row_uj)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width);
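+ // V is passed first so the interleaved output is in VU order (NV21).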
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVJRow(row, row_size, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+ free_aligned_buffer_64(row_uj);
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
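+
+// Usage sketch (buffer names illustrative, width assumed even): convert a
+// packed 24-bit RAW frame to full-range NV21 in one call:
+//   RAWToJNV21(raw, width * 3, y, width, vu, width, width, height);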
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/convert_jpeg.cc b/source/convert_jpeg.cc
index f440c7c2..d7556ee9 100644
--- a/files/source/convert_jpeg.cc
+++ b/source/convert_jpeg.cc
@@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg,
return ret ? 0 : 1;
}
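+
+// DecodeToCallback hands each helper below a band of freshly decoded rows;
+// every call converts one band and advances the destination pointers.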
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 since there is no UV plane.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420: luma sampled 2x2 per chroma sample.
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422: luma sampled 2x1 (horizontally) per chroma sample.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444: no chroma subsampling.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400: grayscale, single component.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
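+
+// Usage sketch (buffer names illustrative): decode one MJPEG frame straight
+// into an NV12 surface of matching size:
+//   MJPGToNV12(jpg, jpg_size, y, y_stride, uv, uv_stride,
+//              width, height, width, height);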
+
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
diff --git a/files/source/convert_to_argb.cc b/source/convert_to_argb.cc
index bde1aa88..84df16c8 100644
--- a/files/source/convert_to_argb.cc
+++ b/source/convert_to_argb.cc
@@ -32,9 +32,6 @@ extern "C" {
// TODO(fbarchard): Add the following:
// H010ToARGB
// I010ToARGB
-// J400ToARGB
-// J422ToARGB
-// J444ToARGB
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
@@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample,
r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+ case FOURCC_J400:
+ src = sample + src_width * crop_y + crop_x;
+ r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
// Biplanar formats
case FOURCC_NV12:
@@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
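+ // The U plane follows the full-size Y plane; the V plane follows the
+ // half-size U plane.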
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
@@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
- case FOURCC_J420: {
+ case FOURCC_U420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
@@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample,
(halfwidth * crop_y + crop_x) / 2;
const uint8_t* src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
@@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H422: {
int halfwidth = (src_width + 1) / 2;
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_U422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = U422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
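+ // 4:4:4: the U and V planes are full width and full height.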
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_U444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
diff --git a/files/source/convert_to_i420.cc b/source/convert_to_i420.cc
index 584be0ac..5869ecd7 100644
--- a/files/source/convert_to_i420.cc
+++ b/source/convert_to_i420.cc
@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
switch (format) {
// Single plane formats
- case FOURCC_YUY2:
+ case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix.
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
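+ // An odd crop_x lands on the V byte of the YUY2 pixel pair, so the U and
+ // V destinations are swapped to keep the chroma phase correct.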
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
- case FOURCC_UYVY:
+ }
+ case FOURCC_UYVY: {
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
+ }
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
@@ -179,11 +187,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/files/source/cpu_id.cc b/source/cpu_id.cc
index 48e2b615..eedce16b 100644
--- a/files/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -20,7 +20,7 @@
#endif
// For ArmCpuCaps() but unittested on all platforms
-#include <stdio.h>
+#include <stdio.h> // For fopen()
#include <string.h>
#ifdef __cplusplus
@@ -40,7 +40,6 @@ extern "C" {
// cpu_info_ variable for SIMD instruction sets detected.
LIBYUV_API int cpu_info_ = 0;
-// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
@@ -75,9 +74,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -108,14 +107,14 @@ void CpuId(int eax, int ecx, int* cpu_info) {
// }
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
// https://code.google.com/p/libyuv/issues/detail?id=529
-#if defined(_M_IX86) && (_MSC_VER < 1900)
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int GetXCR0() {
+static int GetXCR0() {
int xcr0 = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
@@ -129,21 +128,22 @@ int GetXCR0() {
#define GetXCR0() 0
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
-#if defined(_M_IX86) && (_MSC_VER < 1900)
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
-// based on libvpx arm_cpudetect.c
+// Based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
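+ // The "e" fopen mode flag requests close-on-exec where supported (a glibc
+ // extension).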
- FILE* f = fopen(cpuinfo_name, "r");
+ FILE* f = fopen(cpuinfo_name, "re");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
@@ -162,47 +162,128 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
-// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
- FILE* f = fopen(cpuinfo_name, "r");
+ int flag = 0;
+ FILE* f = fopen(cpuinfo_name, "re");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
+#if defined(__riscv_vector)
+ // Assume RVV if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasRVV;
+#else
+ return 0;
+#endif
+ }
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "isa", 3) == 0) {
+ // ISA string must begin with rv64{i,e,g} for a 64-bit processor.
+ char* isa = strstr(cpuinfo_line, "rv64");
+ if (isa) {
+ size_t isa_len = strlen(isa);
+ char* extensions;
+ size_t extensions_len = 0;
+ size_t std_isa_len;
+        // Remove the newline character at the end of the string.
+ if (isa[isa_len - 1] == '\n') {
+ isa[--isa_len] = '\0';
+ }
+        // Expect at least 5 characters: "rv64" plus the base ISA letter.
+ if (isa_len < 5) {
+ fclose(f);
+ return 0;
+ }
+        // Skip canonical checking of the {i,e,g} base ISA letter and
+        // advance past "rv64" plus that letter (5 characters).
+ isa += 5;
+        // Find the first occurrence of 'z', 'x' or 's', which begins the
+        // multi-letter standard, non-standard, or supervisor-level
+        // extensions.
+ extensions = strpbrk(isa, "zxs");
+ if (extensions) {
+          // Multi-letter extensions are separated by a single underscore
+ // as described in RISC-V User-Level ISA V2.2.
+          // Measure extensions before strtok() replaces '_' with NULs.
+          extensions_len = strlen(extensions);
+          char* ext = strtok(extensions, "_");
+ while (ext) {
+ // Search for the ZVFH (Vector FP16) extension.
+ if (!strcmp(ext, "zvfh")) {
+ flag |= kCpuHasRVVZVFH;
+ }
+ ext = strtok(NULL, "_");
+ }
+ }
+ std_isa_len = isa_len - extensions_len - 5;
+ // Detect the v in the standard single-letter extensions.
+ if (memchr(isa, 'v', std_isa_len)) {
+          // The V (vector) extension implies the F extension.
+ flag |= kCpuHasRVV;
+ }
+ }
}
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
+#if defined(__riscv_vector)
+    // Assume RVV if /proc/cpuinfo is from an x86 host running QEMU.
+ else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) ||
+ (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) {
+ fclose(f);
+ return kCpuHasRVV;
}
+#endif
+ }
+ fclose(f);
+ return flag;
+}
+
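The ISA-string parsing above is dense in diff form. Here is a minimal standalone sketch of the same flow, run against a sample string; the "rv64imafdcv_zicsr_zvfh" value and the pointer-difference length arithmetic are illustrative, not code from this patch.

// Sketch: parse a hypothetical RISC-V ISA string the way RiscvCpuCaps does.
#include <stdio.h>
#include <string.h>

int main(void) {
  char isa[] = "rv64imafdcv_zicsr_zvfh";  // Hypothetical /proc/cpuinfo value.
  char* ext = strpbrk(isa + 5, "zxs");    // Start of multi-letter extensions.
  // Single-letter extensions run from after "rv64" + base letter to there.
  size_t std_len = (ext ? (size_t)(ext - isa) : strlen(isa)) - 5;
  int has_v = memchr(isa + 5, 'v', std_len) != NULL;  // RVV present.
  int has_zvfh = 0;
  // Multi-letter extensions are underscore-separated; strtok splits in place.
  for (char* e = ext ? strtok(ext, "_") : NULL; e; e = strtok(NULL, "_")) {
    if (!strcmp(e, "zvfh")) has_zvfh = 1;
  }
  printf("RVV=%d ZVFH=%d\n", has_v, has_zvfh);  // Prints "RVV=1 ZVFH=1".
  return 0;
}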
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ int flag = 0;
+ FILE* f = fopen(cpuinfo_name, "re");
+ if (!f) {
+ // Assume nothing if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
return 0;
}
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
- if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Work around early kernels that omit MSA from the ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMSA;
}
- } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
- char* p = strstr(cpuinfo_line, "Loongson-3");
- if (p) {
- fclose(f);
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
- return 0;
+ }
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+ // ASEs is the last line, so we can break here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
}
+#define LOONGARCH_CFG2 0x2
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
+
+#if defined(__loongarch__)
+LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
+ int flag = 0;
+ uint32_t cfg2 = 0;
+
+ __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2));
+
+ if (cfg2 & LOONGARCH_CFG2_LSX)
+ flag |= kCpuHasLSX;
+
+ if (cfg2 & LOONGARCH_CFG2_LASX)
+ flag |= kCpuHasLASX;
+ return flag;
+}
+#endif
+
static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -211,10 +292,12 @@ static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info0[4] = {0, 0, 0, 0};
int cpu_info1[4] = {0, 0, 0, 0};
int cpu_info7[4] = {0, 0, 0, 0};
+ int cpu_einfo7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
+ CpuId(7, 1, cpu_einfo7);
}
cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
@@ -227,7 +310,9 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
- ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+ ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0) |
+ ((cpu_einfo7[0] & 0x00000010) ? kCpuHasAVXVNNI : 0) |
+ ((cpu_einfo7[3] & 0x00000010) ? kCpuHasAVXVNNIINT8 : 0);
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
@@ -235,20 +320,20 @@ static SAFEBUFFERS int GetCpuFlags(void) {
cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0;
cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
- cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
- cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+ cpu_info |= (cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0;
}
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#elif defined(_MIPS_ARCH_LOONGSON3A)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
+#if defined(__loongarch__) && defined(__linux__)
+ cpu_info = LoongarchCpuCaps();
+ cpu_info |= kCpuHasLOONGARCH;
+#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
@@ -267,6 +352,10 @@ static SAFEBUFFERS int GetCpuFlags(void) {
#endif
cpu_info |= kCpuHasARM;
#endif // __arm__
+#if defined(__riscv) && defined(__linux__)
+ cpu_info = RiscvCpuCaps("/proc/cpuinfo");
+ cpu_info |= kCpuHasRISCV;
+#endif // __riscv
cpu_info |= kCpuInitialized;
return cpu_info;
}
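Taken together, the cpu_id.cc changes add runtime detection for RISC-V Vector, LoongArch SIMD, and the newer AVX VNNI leaves. A minimal sketch of how a caller consumes them through the public TestCpuFlag() API; the dispatch itself is illustrative:

// Sketch: runtime dispatch on the flags added by this patch.
#include "libyuv/cpu_id.h"

const char* PickSimdPath(void) {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasRVV)) return "RVV";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasLASX)) return "LASX";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasLSX)) return "LSX";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVXVNNI)) return "AVX-VNNI";
  return "C";  // Portable fallback.
}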
diff --git a/files/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc
index 5c5e5ead..0141da8a 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/source/mjpeg_decoder.cc
@@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
}
buf_.data = src;
- buf_.len = static_cast<int>(src_len);
+ buf_.len = (int)src_len;
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@@ -417,7 +417,6 @@ void init_source(j_decompress_ptr cinfo) {
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
- assert(0 && "No more data");
// ERROR: No more data
return FALSE;
}
@@ -429,8 +428,8 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
jpeg_source_mgr* src = cinfo->src;
- size_t bytes = static_cast<size_t>(num_bytes);
- if(bytes > src->bytes_in_buffer) {
+ size_t bytes = (size_t)num_bytes;
+ if (bytes > src->bytes_in_buffer) {
src->next_input_byte = nullptr;
src->bytes_in_buffer = 0;
} else {
diff --git a/files/source/mjpeg_validate.cc b/source/mjpeg_validate.cc
index ba0a03ab..ba0a03ab 100644
--- a/files/source/mjpeg_validate.cc
+++ b/source/mjpeg_validate.cc
diff --git a/files/source/planar_functions.cc b/source/planar_functions.cc
index 9cab230f..1c94e260 100644
--- a/files/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h"
+#include <assert.h>
#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
@@ -34,6 +35,9 @@ void CopyPlane(const uint8_t* src_y,
int height) {
int y;
void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -71,6 +75,11 @@ void CopyPlane(const uint8_t* src_y,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Copy plane
for (y = 0; y < height; ++y) {
@@ -80,8 +89,6 @@ void CopyPlane(const uint8_t* src_y,
}
}
-// TODO(fbarchard): Consider support for negative height.
-// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
void CopyPlane_16(const uint16_t* src_y,
int src_stride_y,
@@ -89,36 +96,8 @@ void CopyPlane_16(const uint16_t* src_y,
int dst_stride_y,
int width,
int height) {
- int y;
- void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
-#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_16_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_16_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height; ++y) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
+ CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y,
+ dst_stride_y * 2, width * 2, height);
}
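The rewritten CopyPlane_16 reinterprets 16-bit rows as raw bytes: strides and width given in uint16_t elements are doubled into byte counts and the work is delegated to CopyPlane, which already has SSE2/ERMS/NEON/RVV row kernels. A one-row sketch of the equivalence (names are illustrative):

// Sketch: copying `width` uint16_t pixels equals copying width * 2 bytes.
#include <stdint.h>
#include <string.h>

void CopyRow16AsBytes(const uint16_t* src, uint16_t* dst, int width) {
  memcpy(dst, src, (size_t)width * 2);  // Same bytes, same result.
}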
// Convert a plane of 16 bit data to 8 bit
@@ -134,6 +113,9 @@ void Convert16To8Plane(const uint16_t* src_y,
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -146,6 +128,14 @@ void Convert16To8Plane(const uint16_t* src_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
+#if defined(HAS_CONVERT16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Convert16To8Row = Convert16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ Convert16To8Row = Convert16To8Row_NEON;
+ }
+ }
+#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
@@ -177,13 +167,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
- int scale, // 16384 for 10 bits
+ int scale, // 1024 for 10 bits
int width,
int height) {
int y;
void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
int width) = Convert8To16Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -238,9 +231,12 @@ int I422Copy(const uint8_t* src_y,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -276,7 +272,8 @@ int I444Copy(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -298,6 +295,88 @@ int I444Copy(const uint8_t* src_y,
return 0;
}
+// Copy I210.
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I410.
+LIBYUV_API
+int I410Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
// Copy I400.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
@@ -349,6 +428,56 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
+
+// Copy NV21. Supports inverting.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -363,6 +492,9 @@ void SplitUVPlane(const uint8_t* src_uv,
int y;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -410,14 +542,19 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@@ -440,6 +577,9 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -464,11 +604,19 @@ void MergeUVPlane(const uint8_t* src_u,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
+ if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ MergeUVRow = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_Any_NEON;
@@ -485,14 +633,19 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MergeUVRow = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
@@ -503,6 +656,289 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
+// Support function for P010 etc UV channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
+ uint16_t* dst_v, int depth, int width) =
+ SplitUVRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow_16 = SplitUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow_16 = SplitUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Copy a row of UV.
+ SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+}
+
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
+ uint16_t* dst_uv, int depth, int width) =
+ MergeUVRow_16_C;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_u == width && src_stride_v == width &&
+ dst_stride_uv == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow_16 = MergeUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow_16 = MergeUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+}
+
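A hypothetical round trip through the two 16-bit helpers above for a P010-style frame; the buffers, tightly packed strides, and depth value are example choices, not requirements of the patch:

// Sketch: split an interleaved 10-bit UV plane, then re-merge it.
#include <stdint.h>
#include "libyuv/planar_functions.h"

void RoundTripUV10(const uint16_t* src_uv, int src_stride_uv,
                   uint16_t* tmp_u, uint16_t* tmp_v, uint16_t* dst_uv,
                   int halfwidth, int halfheight) {
  libyuv::SplitUVPlane_16(src_uv, src_stride_uv, tmp_u, halfwidth, tmp_v,
                          halfwidth, halfwidth, halfheight, /*depth=*/10);
  libyuv::MergeUVPlane_16(tmp_u, halfwidth, tmp_v, halfwidth, dst_uv,
                          src_stride_uv, halfwidth, halfheight, /*depth=*/10);
}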
+// Convert a plane from LSB- to MSB-aligned samples.
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << (16 - depth);
+ void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = MultiplyRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_MULTIPLYROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MultiplyRow_16 = MultiplyRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MULTIPLYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MultiplyRow_16 = MultiplyRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MultiplyRow_16(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert a plane from MSB- to LSB-aligned samples.
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << depth;
+ void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = DivideRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_DIVIDEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ DivideRow = DivideRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ DivideRow = DivideRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_DIVIDEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DivideRow = DivideRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DivideRow = DivideRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ DivideRow(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
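A worked example of the two scale factors for depth == 10, assuming the row kernels compute dst = src * scale (multiply) and dst = (src * scale) >> 16 (divide):

// ConvertToMSBPlane_16: scale = 1 << (16 - 10) = 64
//   0x03FF (10-bit max, LSB-aligned) * 64 = 0xFFC0 (MSB-aligned)
// ConvertToLSBPlane_16: scale = 1 << 10 = 1024
//   (0xFFC0 * 1024) >> 16 = 0xFFC0 >> 6 = 0x03FF (LSB-aligned again)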
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
+
// Convert NV21 to NV12.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
@@ -515,51 +951,286 @@ int NV21ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
- int y;
- void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
- UVToVURow_C;
-
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src_vu || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
- src_stride_y = -src_stride_y;
src_stride_vu = -src_stride_vu;
}
- // Coalesce rows.
- if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_vu = dst_stride_uv = 0;
+
+ SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
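NV21ToNV12 now decomposes into an optional Y-plane copy plus a UV byte swap. A hypothetical call for a 640x480 frame with tightly packed planes (all sizes illustrative):

// Sketch: NV21 -> NV12; internally CopyPlane(Y) + SwapUVPlane(320x240 pairs).
#include <stdint.h>
#include "libyuv/planar_functions.h"

void Nv21ToNv12Example(const uint8_t* src_y, const uint8_t* src_vu,
                       uint8_t* dst_y, uint8_t* dst_uv) {
  libyuv::NV21ToNV12(src_y, 640, src_vu, 640, dst_y, 640, dst_uv, 640, 640,
                     480);
}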
+// Test if tile_height is a power of 2 (16 or 32)
+#define IS_POWEROFTWO(x) (!((x) & ((x)-1)))
+
+// Detile a plane of data
+// Tile width is assumed to be 16.
+// tile_height is 16 or 32 for MM21.
+// src_stride_y is bytes per row of the source, ignoring tiling, e.g. 640.
+// TODO: More detile row functions.
+LIBYUV_API
+int DetilePlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
+ int width) = DetileRow_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0 ||
+ !IS_POWEROFTWO(tile_height)) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
}
-#if defined(HAS_UVToVUROW_NEON)
+#if defined(HAS_DETILEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileRow = DetileRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- UVToVURow = UVToVURow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- UVToVURow = UVToVURow_NEON;
+ DetileRow = DetileRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_NEON;
}
}
#endif
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileRow(src_y, src_tile_stride, dst_y, width);
+ dst_y += dst_stride_y;
+ src_y += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_tile_stride + src_stride_y * tile_height;
+ }
+ }
+ return 0;
+}
+
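A worked example of the tile walk, assuming MM21-style 16x32 tiles and a hypothetical 640-byte-wide plane (src_stride_y = 640):

//   src_tile_stride = 16 * 32 = 512 bytes per tile.
//   Output row y starts 16 * (y % 32) bytes into its tile; DetileRow hops
//   across the tile row in steps of src_tile_stride, 16 bytes per hop.
//   After 32 rows the cursor has advanced one whole tile (16 * 32 bytes),
//   so it rewinds by src_tile_stride and jumps to the next band of tiles:
//   src_y += 640 * 32 - 512.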
+// Convert a plane of 16 bit tiles of 16 x H to linear.
+// Tile width is assumed to be 16.
+// tile_height is 16 or 32 for MT2T.
+LIBYUV_API
+int DetilePlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride,
+ uint16_t* dst, int width) = DetileRow_16_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0 ||
+ !IS_POWEROFTWO(tile_height)) {
+ return -1;
}
- for (y = 0; y < halfheight; ++y) {
- UVToVURow(src_vu, dst_uv, halfwidth);
- src_vu += src_stride_vu;
- dst_uv += dst_stride_uv;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+
+#if defined(HAS_DETILEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileRow_16 = DetileRow_16_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_16_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ DetileRow_16 = DetileRow_16_Any_AVX;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_AVX;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileRow_16 = DetileRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_NEON;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileRow_16(src_y, src_tile_stride, dst_y, width);
+ dst_y += dst_stride_y;
+ src_y += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_tile_stride + src_stride_y * tile_height;
+ }
}
return 0;
}
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ DetileSplitUVRow_C;
+  assert(src_stride_uv > 0);
+  assert(tile_height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_stride_u = -dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_v = -dst_stride_v;
+ }
+
+#if defined(HAS_DETILESPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_DETILESPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_NEON;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
+ }
+ }
+}
+
+LIBYUV_API
+void DetileToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_y_tile_stride = 16 * tile_height;
+ const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2;
+ int y;
+ void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2, int width) = DetileToYUY2_C;
+  assert(src_stride_y > 0);
+  assert(src_stride_uv > 0);
+ assert(tile_height > 0);
+
+ if (width <= 0 || height == 0 || tile_height <= 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+
+#if defined(HAS_DETILETOYUY2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileToYUY2 = DetileToYUY2_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileToYUY2 = DetileToYUY2_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_DETILETOYUY2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileToYUY2 = DetileToYUY2_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileToYUY2 = DetileToYUY2_SSE2;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2,
+ width);
+ dst_yuy2 += dst_stride_yuy2;
+ src_y += 16;
+
+ if (y & 0x1)
+ src_uv += 16;
+
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_y_tile_stride + src_stride_y * tile_height;
+ src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2);
+ }
+ }
+}
+
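One subtlety in DetileToYUY2 worth a reading note (descriptive only, not code from the patch):

// The source is 4:2:0, so one UV row serves two Y rows: the UV cursor takes
// its 16-byte step only on odd rows, and the UV band jump uses
// tile_height / 2 with the halved src_uv_tile_stride.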
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -576,6 +1247,9 @@ void SplitRGBPlane(const uint8_t* src_rgb,
int y;
void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, int width) = SplitRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -609,12 +1283,9 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
-#if defined(HAS_SPLITRGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitRGBRow = SplitRGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- SplitRGBRow = SplitRGBRow_MMI;
- }
+#if defined(HAS_SPLITRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitRGBRow = SplitRGBRow_RVV;
}
#endif
@@ -643,6 +1314,9 @@ void MergeRGBPlane(const uint8_t* src_r,
void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, uint8_t* dst_rgb, int width) =
MergeRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
@@ -673,12 +1347,9 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
#endif
-#if defined(HAS_MERGERGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeRGBRow = MergeRGBRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MergeRGBRow = MergeRGBRow_MMI;
- }
+#if defined(HAS_MERGERGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeRGBRow = MergeRGBRow_RVV;
}
#endif
@@ -692,67 +1363,694 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
+LIBYUV_NOINLINE
+static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, uint8_t* dst_a, int width) =
+ SplitARGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
+ dst_stride_a = 0;
+ }
+
+#if defined(HAS_SPLITARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitARGBRow = SplitARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ SplitARGBRow = SplitARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitARGBRow = SplitARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitARGBRow = SplitARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitARGBRow = SplitARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitARGBRow = SplitARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitARGBRow = SplitARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ dst_a += dst_stride_a;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int y;
+ void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, int width) = SplitXRGBRow_C;
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+ }
+
+#if defined(HAS_SPLITXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitXRGBRow = SplitXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitXRGBRow = SplitXRGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
+ dst_r = dst_r + (height - 1) * dst_stride_r;
+ dst_g = dst_g + (height - 1) * dst_stride_g;
+ dst_b = dst_b + (height - 1) * dst_stride_b;
+ dst_a = dst_a + (height - 1) * dst_stride_a;
+ dst_stride_r = -dst_stride_r;
+ dst_stride_g = -dst_stride_g;
+ dst_stride_b = -dst_stride_b;
+ dst_stride_a = -dst_stride_a;
}
-#if defined(HAS_MIRRORROW_NEON)
+
+ if (dst_a == NULL) {
+ SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, width, height);
+ } else {
+ SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a,
+ width, height);
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGBPlaneAlpha(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, const uint8_t* src_a,
+ uint8_t* dst_argb, int width) = MergeARGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeARGBRow = MergeARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGBRow = MergeARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeARGBRow = MergeARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGBRow = MergeARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
+ MergeARGBRow = MergeARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ MergeARGBRow = MergeARGBRow_NEON;
}
}
#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
+#if defined(HAS_MERGEARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeARGBRow = MergeARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGBPlaneOpaque(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, uint8_t* dst_argb, int width) =
+ MergeXRGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXRGBRow = MergeXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ MergeXRGBRow = MergeXRGBRow_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_AVX2)
+#if defined(HAS_MERGEXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGBRow = MergeXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGBRow = MergeXRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeXRGBRow = MergeXRGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
+ if (src_a == NULL) {
+ MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height);
+ } else {
+ MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height);
+ }
+}
+
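MergeARGBPlane, like the AR64 and ARGB16To8 variants below, dispatches on a NULL alpha pointer. A hypothetical opaque-output call with tightly packed, illustrative strides:

// Sketch: NULL src_a selects the MergeXRGBRow (opaque) path.
#include <stddef.h>
#include <stdint.h>
#include "libyuv/planar_functions.h"

void MergeOpaque(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                 uint8_t* dst_argb, int width, int height) {
  libyuv::MergeARGBPlane(r, width, g, width, b, width, /*src_a=*/NULL,
                         /*src_stride_a=*/0, dst_argb, width * 4, width,
                         height);
}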
+// TODO(yuan): Support 2-bit alpha channel.
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_ar30, int depth,
+ int width) = MergeXR30Row_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_MERGEXR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ MergeXR30Row = MergeXR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXR30Row = MergeXR30Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
+#if defined(HAS_MERGEXR30ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (depth == 10) {
+ MergeXR30Row = MergeXR30Row_10_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_10_NEON;
+ }
+ } else {
+ MergeXR30Row = MergeXR30Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar30 += dst_stride_ar30;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint16_t* dst_argb, int depth, int width) =
+ MergeAR64Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeAR64Row = MergeAR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeAR64Row = MergeAR64Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
+#if defined(HAS_MERGEAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeAR64Row = MergeAR64Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
+ MergeAR64Row = MergeAR64Row_NEON;
}
}
#endif
- // Mirror plane
for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
+ MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint16_t* dst_argb, int depth,
+ int width) = MergeXR64Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEXR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXR64Row = MergeXR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXR64Row = MergeXR64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXR64Row = MergeXR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR64Row = MergeXR64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
+ dst_stride_ar64 = -dst_stride_ar64;
+ }
+
+ if (src_a == NULL) {
+ MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_ar64, dst_stride_ar64, width, height,
+ depth);
+ } else {
+ MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_ar64,
+ dst_stride_ar64, width, height, depth);
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint8_t* dst_argb, int depth, int width) =
+ MergeARGB16To8Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGB16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_argb, int depth,
+ int width) = MergeXRGB16To8Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGB16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
+ if (src_a == NULL) {
+ MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height, depth);
+ } else {
+ MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height, depth);
}
}
@@ -820,7 +2118,7 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
@@ -830,13 +2128,23 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_LSX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LASX;
}
}
#endif
@@ -916,7 +2224,7 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
@@ -926,13 +2234,23 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUV422Row = UYVYToUV422Row_MMI;
+ UYVYToYRow = UYVYToYRow_LSX;
+ UYVYToUV422Row = UYVYToUV422Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_LASX;
}
}
#endif
@@ -1006,23 +2324,238 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Convert UYVY to Y.
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+ UYVYToYRow_C;
+ if (!src_uyvy || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = 0;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_LSX;
}
}
#endif
for (y = 0; y < height; ++y) {
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
}
return 0;
}
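+
+// A minimal usage sketch for UYVYToY, assuming a contiguous 640x480 UYVY
+// frame; the buffer names and dimensions are illustrative only. Because
+// src_stride is width * 2 and dst_stride is width, the call hits the
+// coalesce-rows fast path above and processes the frame as one long row.
+static void ExampleExtractLumaFromUYVY(const uint8_t* uyvy, uint8_t* y_plane) {
+ UYVYToY(uyvy, 640 * 2, y_plane, 640, 640, 480);
+}
+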
+// Mirror a plane of data.
+// See also I400Mirror.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorRow = MirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
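+// MirrorRow reverses a single row: dst[x] = src[width - 1 - x]. A scalar
+// sketch equivalent to the MirrorRow_C fallback selected above
+// (illustrative, not the tuned SIMD paths):
+static void ExampleMirrorRow(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ src += width - 1; // Start at the last byte of the source row.
+ for (x = 0; x < width; ++x) {
+ dst[x] = src[-x];
+ }
+}
+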
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorUVRow = MirrorUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorUVRow = MirrorUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorUVRow = MirrorUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_LASX;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1063,10 +2596,12 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -1087,6 +2622,43 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
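+// NV12Mirror composes two plane mirrors: the Y plane is mirrored byte by
+// byte, while the interleaved UV plane is mirrored in 2-byte UV pairs so
+// each chroma pair stays intact. A scalar sketch of that pair reversal,
+// in the spirit of the MirrorUVRow_C reference (illustrative):
+static void ExampleMirrorUVRow(const uint8_t* src_uv, uint8_t* dst_uv,
+ int width) {
+ int x;
+ src_uv += (width - 1) * 2; // Last UV pair; width counts pairs.
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0]; // U
+ dst_uv[1] = src_uv[1]; // V
+ dst_uv += 2;
+ src_uv -= 2;
+ }
+}
+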
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1110,7 +2682,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1139,11 +2711,19 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -1157,35 +2737,50 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
-// Get a blender that optimized for the CPU and pixel count.
-// As there are 6 blenders to choose from, the caller should try to use
-// the same blend function for all pixels if possible.
+// RGB24 mirror.
LIBYUV_API
-ARGBBlendRow GetARGBBlend() {
- void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
- uint8_t* dst_argb, int width) = ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBBlendRow = ARGBBlendRow_SSSE3;
- return ARGBBlendRow;
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+ RGB24MirrorRow_C;
+ if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
}
-#endif
-#if defined(HAS_ARGBBLENDROW_NEON)
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+#if defined(HAS_RGB24MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ARGBBlendRow = ARGBBlendRow_NEON;
+ RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_ARGBBLENDROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBBlendRow = ARGBBlendRow_MSA;
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+ }
}
#endif
-#if defined(HAS_ARGBBLENDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBBlendRow = ARGBBlendRow_MMI;
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_rgb24 += dst_stride_rgb24;
}
-#endif
- return ARGBBlendRow;
+ return 0;
}
// Alpha Blend 2 ARGB images and store to destination.
@@ -1200,7 +2795,7 @@ int ARGBBlend(const uint8_t* src_argb0,
int height) {
int y;
void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
- uint8_t* dst_argb, int width) = GetARGBBlend();
+ uint8_t* dst_argb, int width) = ARGBBlendRow_C;
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1217,7 +2812,31 @@ int ARGBBlend(const uint8_t* src_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBBlendRow = ARGBBlendRow_MSA;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBBlendRow = ARGBBlendRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBBlendRow = ARGBBlendRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
@@ -1277,12 +2896,9 @@ int BlendPlane(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
+#if defined(HAS_BLENDPLANEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BlendPlaneRow = BlendPlaneRow_RVV;
}
#endif
@@ -1329,6 +2945,7 @@ int I420Blend(const uint8_t* src_y0,
BlendPlaneRow_C;
void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
!alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1361,12 +2978,9 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
+#if defined(HAS_BLENDPLANEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BlendPlaneRow = BlendPlaneRow_RVV;
}
#endif
if (!IS_ALIGNED(width, 2)) {
@@ -1405,20 +3019,16 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- ScaleRowDown2 = ScaleRowDown2Box_MMI;
- }
- }
+#if defined(HAS_SCALEROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown2 = ScaleRowDown2Box_RVV;
}
#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
+ if (!halfalpha)
+ return 1;
for (y = 0; y < height; y += 2) {
// last row of odd height image use 1 row of alpha instead of 2.
if (y == (height - 1)) {
@@ -1501,11 +3111,19 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBMULTIPLYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+#if defined(HAS_ARGBMULTIPLYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LASX;
}
}
#endif
@@ -1549,12 +3167,12 @@ int ARGBAdd(const uint8_t* src_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2;
}
#endif
-#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
@@ -1586,11 +3204,19 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAddRow = ARGBAddRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAddRow = ARGBAddRow_MMI;
+#if defined(HAS_ARGBADDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAddRow = ARGBAddRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAddRow = ARGBAddRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_LASX;
}
}
#endif
@@ -1666,11 +3292,19 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBSUBTRACTROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBSubtractRow = ARGBSubtractRow_MMI;
+#if defined(HAS_ARGBSUBTRACTROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_LASX;
}
}
#endif
@@ -1684,177 +3318,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1906,14 +3369,19 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToRGB24Row = RAWToRGB24Row_MMI;
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_LSX;
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToRGB24Row = RAWToRGB24Row_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1923,6 +3391,7 @@ int RAWToRGB24(const uint8_t* src_raw,
return 0;
}
+// TODO(fbarchard): Consider uint8_t value
LIBYUV_API
void SetPlane(uint8_t* dst_y,
int dst_stride_y,
@@ -1930,7 +3399,11 @@ void SetPlane(uint8_t* dst_y,
int height,
uint32_t value) {
int y;
- void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
+ void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C;
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1968,10 +3441,18 @@ void SetPlane(uint8_t* dst_y,
SetRow = SetRow_MSA;
}
#endif
+#if defined(HAS_SETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SetRow = SetRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
- SetRow(dst_y, value, width);
+ SetRow(dst_y, (uint8_t)value, width);
dst_y += dst_stride_y;
}
}
@@ -1996,6 +3477,7 @@ int I420Rect(uint8_t* dst_y,
uint8_t* start_y = dst_y + y * dst_stride_y + x;
uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
@@ -2018,7 +3500,7 @@ int ARGBRect(uint8_t* dst_argb,
int height,
uint32_t value) {
int y;
- void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+ void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) =
ARGBSetRow_C;
if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -2057,6 +3539,14 @@ int ARGBRect(uint8_t* dst_argb,
}
}
#endif
+#if defined(HAS_ARGBSETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBSetRow = ARGBSetRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -2135,14 +3625,27 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -2243,9 +3746,14 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2293,9 +3801,14 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2315,7 +3828,7 @@ int ARGBSepia(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+ void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -2341,9 +3854,14 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
-#if defined(HAS_ARGBSEPIAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBSepiaRow = ARGBSepiaRow_MMI;
+#if defined(HAS_ARGBSEPIAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBSEPIAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_LASX;
}
#endif
@@ -2397,9 +3915,9 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
}
#endif
-#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+#if defined(HAS_ARGBCOLORMATRIXROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_LSX;
}
#endif
for (y = 0; y < height; ++y) {
@@ -2458,7 +3976,7 @@ int ARGBColorTable(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
+ void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
int width) = ARGBColorTableRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
@@ -2494,7 +4012,7 @@ int RGBColorTable(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
+ void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
int width) = RGBColorTableRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
@@ -2539,7 +4057,7 @@ int ARGBQuantize(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
+ void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size,
int interval_offset, int width) = ARGBQuantizeRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
@@ -2567,6 +4085,11 @@ int ARGBQuantize(uint8_t* dst_argb,
ARGBQuantizeRow = ARGBQuantizeRow_MSA;
}
#endif
+#if defined(HAS_ARGBQUANTIZEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_LSX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
dst += dst_stride_argb;
@@ -2596,11 +4119,6 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb,
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
@@ -2651,7 +4169,7 @@ int ARGBBlur(const uint8_t* src_argb,
if (radius > (width / 2 - 1)) {
radius = width / 2 - 1;
}
- if (radius <= 0) {
+ if (radius <= 0 || height <= 1) {
return -1;
}
#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
@@ -2660,11 +4178,6 @@ int ARGBBlur(const uint8_t* src_argb,
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2771,9 +4284,14 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
-#if defined(HAS_ARGBSHADEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBShadeRow = ARGBShadeRow_MMI;
+#if defined(HAS_ARGBSHADEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBSHADEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_LASX;
}
#endif
@@ -2797,7 +4315,7 @@ int InterpolatePlane(const uint8_t* src0,
int height,
int interpolation) {
int y;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -2847,14 +4365,19 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -2865,6 +4388,86 @@ int InterpolatePlane(const uint8_t* src0,
return 0;
}
+// Interpolate 2 planes by specified amount (0 to 255).
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+ int src_stride0,
+ const uint16_t* src1,
+ int src_stride1,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation) {
+ int y;
+ void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst = dst + (height - 1) * dst_stride;
+ dst_stride = -dst_stride;
+ }
+ // Coalesce rows.
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride0 = src_stride1 = dst_stride = 0;
+ }
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow_16 = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow_16 = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_LSX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ InterpolateRow_16(dst, src0, src1 - src0, width, interpolation);
+ src0 += src_stride0;
+ src1 += src_stride1;
+ dst += dst_stride;
+ }
+ return 0;
+}
+
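+// The row interpolators compute a fixed-point weighted average: for a
+// fraction f in [0, 256), dst = (src0 * (256 - f) + src1 * f) >> 8, so
+// f == 0 copies src0 and f == 128 is an even average. A one-sample scalar
+// sketch of the 16-bit blend (illustrative; the shipped rows are SIMD):
+static uint16_t ExampleInterpolateSample_16(uint16_t s0, uint16_t s1, int f) {
+ return (uint16_t)((s0 * (256 - f) + s1 * f) >> 8);
+}
+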
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
int ARGBInterpolate(const uint8_t* src_argb0,
@@ -2906,10 +4509,12 @@ int I420Interpolate(const uint8_t* src0_y,
int interpolation) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
!dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
+
InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
dst_stride_y, width, height, interpolation);
InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
@@ -2978,11 +4583,19 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_ARGBSHUFFLEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBShuffleRow = ARGBShuffleRow_MMI;
+#if defined(HAS_ARGBSHUFFLEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_LASX;
}
}
#endif
@@ -2995,6 +4608,144 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Shuffle AR64 channel order, e.g. AR64 to AB64.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ const uint8_t* shuffler,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64,
+ const uint8_t* shuffler, int width) = AR64ShuffleRow_C;
+ if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_ar64 = 0;
+ }
+ // The ARGB byte-shuffle assembly versions can be reused here, since the
+ // AR64 shuffle is also expressible as a byte shuffle.
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ AR64ShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ AR64ShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler,
+ width * 2);
+ src_ar64 += src_stride_ar64;
+ dst_ar64 += dst_stride_ar64;
+ }
+ return 0;
+}
+
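+// Why the width * 2 trick in AR64Shuffle above works: an AR64 pixel is
+// four 16-bit channels, i.e. 8 bytes, which is exactly two 4-byte ARGB
+// pixels, so a byte-shuffle kernel written for ARGB covers an AR64 row
+// when told it has twice as many pixels:
+//
+// AR64 row bytes = width * 8 = (width * 2) * 4 = ARGB row bytes.
+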
+// Gaussian blur a float plane using a 5x5 filter with coefficients
+// 1, 4, 6, 4, 1. Each destination pixel is a blur of the 5x5 source
+// pixels around it. Source edges are clamped: the edge is 2 pixels on
+// each side, and the interior is a multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ if (!rowbuf)
+ return 1;
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ float* row = (float*)(rowbuf + 16);
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
+
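+// A scalar sketch of the horizontal pass, under the assumption (matching
+// the C reference path) that the column pass is unscaled and the row pass
+// applies the combined 1/256 gain of both 1-4-6-4-1 kernels; src points 2
+// floats before the first center pixel (illustrative):
+static void ExampleGaussRow_F32(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[i] = (src[0] + src[1] * 4.f + src[2] * 6.f + src[3] * 4.f + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+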
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -3044,7 +4795,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -3057,14 +4808,27 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
+ }
+#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -3081,11 +4845,6 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_MSA;
}
#endif
-#if defined(HAS_SOBELYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelYRow = SobelYRow_MMI;
- }
-#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -3101,23 +4860,20 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_MSA;
}
#endif
-#if defined(HAS_SOBELXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXRow = SobelXRow_MMI;
- }
-#endif
{
// 3 rows with edges before/after.
- const int kRowSize = (width + kEdge + 31) & ~31;
- align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ const int row_size = (width + kEdge + 31) & ~31;
+ align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge));
uint8_t* row_sobelx = rows;
- uint8_t* row_sobely = rows + kRowSize;
- uint8_t* row_y = rows + kRowSize * 2;
+ uint8_t* row_sobely = rows + row_size;
+ uint8_t* row_y = rows + row_size * 2;
// Convert first row.
uint8_t* row_y0 = row_y + kEdge;
- uint8_t* row_y1 = row_y0 + kRowSize;
- uint8_t* row_y2 = row_y1 + kRowSize;
+ uint8_t* row_y1 = row_y0 + row_size;
+ uint8_t* row_y2 = row_y1 + row_size;
+ if (!rows)
+ return 1;
ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
@@ -3188,11 +4944,11 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelRow = SobelRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelRow = SobelRow_MMI;
+#if defined(HAS_SOBELROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelRow = SobelRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_LSX;
}
}
#endif
@@ -3234,11 +4990,11 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELTOPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelToPlaneRow = SobelToPlaneRow_MMI;
+#if defined(HAS_SOBELTOPLANEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_LSX;
}
}
#endif
@@ -3281,11 +5037,11 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELXYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXYRow = SobelXYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelXYRow = SobelXYRow_MMI;
+#if defined(HAS_SOBELXYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelXYRow = SobelXYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_LSX;
}
}
#endif
@@ -3412,6 +5168,14 @@ int HalfFloatPlane(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_HALFFLOATROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ HalfFloatRow = HalfFloatRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ HalfFloatRow = HalfFloatRow_LSX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
@@ -3526,14 +5290,6 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBCOPYALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3592,10 +5348,15 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_MSA;
}
#endif
-#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
- : ARGBExtractAlphaRow_Any_MMI;
+#if defined(HAS_ARGBEXTRACTALPHAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX
+ : ARGBExtractAlphaRow_Any_LSX;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
}
#endif
@@ -3649,12 +5410,9 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
- }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV;
}
#endif
@@ -3666,9 +5424,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): Consider if width is even Y channel can be split
-// directly. A SplitUVRow_Odd function could copy the remaining chroma.
-
LIBYUV_API
int YUY2ToNV12(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -3679,124 +5434,105 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
int width,
int height) {
int y;
- int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
- int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = InterpolateRow_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+ YUY2ToYRow_C;
+ void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2,
+ uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C;
if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
-#if defined(HAS_SPLITUVROW_SSE2)
+#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- SplitUVRow = SplitUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
-#if defined(HAS_SPLITUVROW_AVX2)
+#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- SplitUVRow = SplitUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
-#if defined(HAS_SPLITUVROW_NEON)
+#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- SplitUVRow = SplitUVRow_Any_NEON;
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- SplitUVRow = SplitUVRow_NEON;
+ YUY2ToYRow = YUY2ToYRow_NEON;
}
}
#endif
-#if defined(HAS_SPLITUVROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
- SplitUVRow = SplitUVRow_Any_MSA;
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+ YUY2ToYRow = YUY2ToYRow_MSA;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
+ YUY2ToYRow = YUY2ToYRow_LSX;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_AVX2;
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_LASX;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_NEON;
+
+#if defined(HAS_YUY2TONVUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
+ YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
+#if defined(HAS_YUY2TONVUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
+ YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_YUY2TONVUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_NEON;
}
}
#endif
- {
- int awidth = halfwidth * 2;
- // row of y and 2 rows of uv
- align_buffer_64(rows, awidth * 3);
-
- for (y = 0; y < height - 1; y += 2) {
- // Split Y from UV.
- SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
- memcpy(dst_y, rows, width);
- SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
- memcpy(dst_y + dst_stride_y, rows, width);
- InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
- src_yuy2 += src_stride_yuy2 * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- // Split Y from UV.
- SplitUVRow(src_yuy2, rows, dst_uv, awidth);
- memcpy(dst_y, rows, width);
- }
- free_aligned_buffer_64(rows);
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width);
}
return 0;
}
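+
+// YUY2 packs two pixels as Y0 U Y1 V. The loop above writes two Y rows
+// directly and one UV row by averaging the chroma of the row pair (the
+// odd-height tail passes stride 0, averaging the row with itself). A
+// scalar sketch of that vertical chroma filter (illustrative):
+static void ExampleYUY2ToNVUVRow(const uint8_t* src_yuy2, int stride_yuy2,
+ uint8_t* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1; // U
+ dst_uv[1] = (src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1; // V
+ src_yuy2 += 4;
+ dst_uv += 2;
+ }
+}
+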
@@ -3814,12 +5550,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
+
if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -3858,14 +5596,20 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
+
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3898,19 +5642,26 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
{
int awidth = halfwidth * 2;
// row of y and 2 rows of uv
align_buffer_64(rows, awidth * 3);
+ if (!rows)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
@@ -3933,6 +5684,57 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// width and height are the source plane size, allowing odd sizes to be
+// handled.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
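+// Each output UV pair above is a rounded 2x2 box average of the
+// full-resolution U and V planes, folding 4:4:4 chroma down to NV12's
+// half resolution. A scalar sketch for even widths (the library row
+// function also handles the odd-column tail; illustrative):
+static void ExampleHalfMergeUVRow(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >> 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >> 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+}
+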
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/rotate.cc b/source/rotate.cc
new file mode 100644
index 00000000..3f8332c3
--- /dev/null
+++ b/source/rotate.cc
@@ -0,0 +1,1231 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
+ void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx16_C;
+#else
+ void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeWx16 = TransposeWx16_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_LSX;
+ }
+ }
+#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
+ // Work across the source in 16x16 tiles
+ while (i >= 16) {
+ TransposeWx16(src, src_stride, dst, dst_stride, width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
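+// All of the tile kernels above reduce to the same index swap: output
+// element (i, j) reads input element (j, i). A scalar sketch, equivalent
+// in spirit to the TransposeWxH_C leftover path (illustrative):
+static void ExampleTransposeWxH(const uint8_t* src, int src_stride,
+ uint8_t* dst, int dst_stride, int width,
+ int height) {
+ int i, j;
+ for (i = 0; i < width; ++i) {
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+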
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
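+// Worked example of the flip-then-transpose identity (and of its dual: the
+// 270 rotation below flips the destination instead). For a 2x3 plane
+// a b c
+// d e f
+// reading rows bottom to top gives (d e f / a b c); transposing that
+// yields the 3x2 clockwise rotation:
+// d a
+// e b
+// f c
+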
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Swap top and bottom row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ assert(row);
+ if (!row)
+ return;
+ const uint8_t* src_bot = src + src_stride * (height - 1);
+ uint8_t* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorRow = MirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ CopyRow(src, row, width); // Copy top row into buffer
+ MirrorRow(src_bot, dst, width); // Mirror bottom row into top row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
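+// Worked example: rotating
+// a b c
+// d e f
+// by 180 degrees mirrors each row and swaps top with bottom:
+// f e d
+// c b a
+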
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#else
+ void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_LSX;
+ }
+ }
+#else
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+ }
+#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ // Work through the source in 16-row tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ // Work through the source in 16-row tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, i);
+ }
+}
+
+LIBYUV_API
+void SplitRotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
+}
+
+LIBYUV_API
+void SplitRotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
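+// For interleaved UV input, MirrorSplitUVRow below both reverses a row and
+// deinterleaves it, so each source row lands mirrored in dst_a and dst_b.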
+LIBYUV_API
+void SplitRotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_LSX;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) {
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+// Rotate UV and split into planar U and V planes.
+// Width and height are expected to be half size for NV12.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+static void TransposePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height;
+ // Work across the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeWx8_16_C(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ if (i > 0) {
+ TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
+static void RotatePlane90_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
+}
+
+static void RotatePlane270_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
+}
+
+static void RotatePlane180_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ const uint16_t* src_bot = src + src_stride * (height - 1);
+ uint16_t* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+
+ // Swap the top and bottom rows and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 2);
+ uint16_t* row_tmp = (uint16_t*)row;
+ assert(row);
+ if (!row)
+ return;
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ CopyRow_16_C(src, row_tmp, width); // Copy top row into buffer
+ MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row
+ MirrorRow_16_C(row_tmp, dst_bot, width); // Mirror buffer into bottom row
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int RotatePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+// I422 has half-width x full-height UV planes, so rotations by 90 and 270
+// require scaling to maintain 4:2:2 subsampling.
+LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int r;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // Copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+
+ // Note on the temporary use of the Y plane for UV.
+ // The UV planes are rotated first; their rotated data fits within the
+ // rows of the Y destination plane:
+ // Y plane is width x height
+ // Y plane rotated is height x width
+ // UV plane is (width / 2) x height
+ // UV plane rotated is height x (width / 2)
+ // UV plane rotated+scaled is (height / 2) x width.
+ // UV plane rotated is a temporary that fits within the Y plane rotated.
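+ // Worked example (illustrative): for 640x480 I422 the rotated U plane
+ // is 480 wide x 320 tall and fits in the 480x640 rotated Y plane;
+ // ScalePlane then resamples it to 240x640 to restore 4:2:2 subsampling.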
+
+ case kRotate90:
+ RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+ !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
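+// Copy every src_pixel_stride_uv-th byte of a row; e.g. a pixel stride of
+// 2 extracts a single channel from an interleaved UV row.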
+static void SplitPixels(const uint8_t* src_u,
+ int src_pixel_stride_uv,
+ uint8_t* dst_u,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst_u = *src_u;
+ ++dst_u;
+ src_u += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to I420 with Rotate
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode rotation) {
+ int y;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ rotation);
+ }
+
+ // Copy UV planes - I420
+ if (src_pixel_stride_uv == 1) {
+ RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight,
+ rotation);
+ RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight,
+ rotation);
+ return 0;
+ }
+ // Split UV planes - NV21
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ halfwidth, halfheight, rotation);
+ return 0;
+ }
+ // Split UV planes - NV12
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+ SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight, rotation);
+ return 0;
+ }
+
+ if (rotation == 0) {
+ for (y = 0; y < halfheight; ++y) {
+ SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+ SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+ }
+ // Unsupported UV pixel stride and/or rotation.
+ return -1;
+}
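+
+// Usage sketch (illustrative, not part of the public header): rotate a
+// w x h Android 420 frame with interleaved chroma (pixel stride 2) by 90
+// degrees into an h x w I420 destination:
+// Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u,
+// src_v, src_stride_v, 2, dst_y, h,
+// dst_u, (h + 1) / 2, dst_v, (h + 1) / 2,
+// w, h, kRotate90);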
+
+LIBYUV_API
+int I010Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v || dst_stride_y < 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+// I210 has half-width x full-height UV planes, so rotations by 90 and 270
+// require scaling to maintain 4:2:2 subsampling.
+LIBYUV_API
+int I210Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int r;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // Copy frame
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+
+ // Note on the temporary use of the Y plane for UV.
+ // The UV planes are rotated first; their rotated data fits within the
+ // rows of the Y destination plane:
+ // Y plane is width x height
+ // Y plane rotated is height x width
+ // UV plane is (width / 2) x height
+ // UV plane rotated is height x (width / 2)
+ // UV plane rotated+scaled is (height / 2) x width.
+ // UV plane rotated is a temporary that fits within the Y plane rotated.
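+ // The staging is the same as in I422Rotate above, with 16-bit samples:
+ // e.g. for 640x480 I210 the rotated U plane (480x320) fits in the
+ // 480x640 rotated Y plane before ScalePlane_16 resamples it to 240x640.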
+
+ case kRotate90:
+ RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ height);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I410Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v || dst_stride_y < 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width,
+ height);
+ RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width,
+ height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width,
+ height);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_any.cc b/source/rotate_any.cc
index b3baf084..88ca7876 100644
--- a/files/source/rotate_any.cc
+++ b/source/rotate_any.cc
@@ -35,15 +35,15 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
-#ifdef HAS_TRANSPOSEWX8_MMI
-TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
#ifdef HAS_TRANSPOSEWX16_MSA
TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
#endif
+#ifdef HAS_TRANSPOSEWX16_LSX
+TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, 15)
+#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
@@ -65,12 +65,12 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
-#ifdef HAS_TRANSPOSEUVWX8_MMI
-TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX16_LSX
+TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7)
+#endif
#undef TUVANY
#ifdef __cplusplus
diff --git a/files/source/rotate_argb.cc b/source/rotate_argb.cc
index a93fd55f..d55fac4f 100644
--- a/files/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -8,11 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
#include "libyuv/convert.h"
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
@@ -21,17 +22,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -56,60 +61,65 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
if (IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
+ }
+#endif
for (i = 0; i < width; ++i) { // column of source to row of dest.
ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap the first and last rows and mirror the content. Uses a temporary row.
- align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
int half_height = (height + 1) >> 1;
@@ -118,10 +128,13 @@ void ARGBRotate180(const uint8_t* src_argb,
ARGBMirrorRow_C;
void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
CopyRow_C;
+ align_buffer_64(row, width * 4);
+ if (!row)
+ return 1;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -150,11 +163,19 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -178,6 +199,11 @@ void ARGBRotate180(const uint8_t* src_argb,
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
@@ -190,6 +216,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -217,17 +244,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
diff --git a/source/rotate_common.cc b/source/rotate_common.cc
new file mode 100644
index 00000000..e72608e9
--- /dev/null
+++ b/source/rotate_common.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeUVWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+void TransposeWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+void TransposeUVWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)];
+ dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+void TransposeWx8_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeWxH_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ int i;
+ for (i = 0; i < width; i += 4) {
+ uint32_t p00 = ((uint32_t*)(src))[0];
+ uint32_t p10 = ((uint32_t*)(src))[1];
+ uint32_t p20 = ((uint32_t*)(src))[2];
+ uint32_t p30 = ((uint32_t*)(src))[3];
+ uint32_t p01 = ((uint32_t*)(src1))[0];
+ uint32_t p11 = ((uint32_t*)(src1))[1];
+ uint32_t p21 = ((uint32_t*)(src1))[2];
+ uint32_t p31 = ((uint32_t*)(src1))[3];
+ uint32_t p02 = ((uint32_t*)(src2))[0];
+ uint32_t p12 = ((uint32_t*)(src2))[1];
+ uint32_t p22 = ((uint32_t*)(src2))[2];
+ uint32_t p32 = ((uint32_t*)(src2))[3];
+ uint32_t p03 = ((uint32_t*)(src3))[0];
+ uint32_t p13 = ((uint32_t*)(src3))[1];
+ uint32_t p23 = ((uint32_t*)(src3))[2];
+ uint32_t p33 = ((uint32_t*)(src3))[3];
+ ((uint32_t*)(dst))[0] = p00;
+ ((uint32_t*)(dst))[1] = p01;
+ ((uint32_t*)(dst))[2] = p02;
+ ((uint32_t*)(dst))[3] = p03;
+ ((uint32_t*)(dst1))[0] = p10;
+ ((uint32_t*)(dst1))[1] = p11;
+ ((uint32_t*)(dst1))[2] = p12;
+ ((uint32_t*)(dst1))[3] = p13;
+ ((uint32_t*)(dst2))[0] = p20;
+ ((uint32_t*)(dst2))[1] = p21;
+ ((uint32_t*)(dst2))[2] = p22;
+ ((uint32_t*)(dst2))[3] = p23;
+ ((uint32_t*)(dst3))[0] = p30;
+ ((uint32_t*)(dst3))[1] = p31;
+ ((uint32_t*)(dst3))[2] = p32;
+ ((uint32_t*)(dst3))[3] = p33;
+ src += src_stride * 4; // advance 4 rows
+ src1 += src_stride * 4;
+ src2 += src_stride * 4;
+ src3 += src_stride * 4;
+ dst += 4 * 4; // advance 4 columns
+ dst1 += 4 * 4;
+ dst2 += 4 * 4;
+ dst3 += 4 * 4;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc
new file mode 100644
index 00000000..fd5eee05
--- /dev/null
+++ b/source/rotate_gcc.cc
@@ -0,0 +1,503 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCl for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit.
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm15");
+}
+#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9");
+}
+#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_SSE2)
+// 4 values, little-endian view
+// a b c d
+// e f g h
+// i j k l
+// m n o p
+
+// transpose 2x2
+// a e b f from row 0, 1
+// i m j n from row 2, 3
+// c g d h from row 0, 1
+// k o l p from row 2, 3
+
+// transpose 4x4
+// a e i m from row 0, 1
+// b f j n from row 0, 1
+// c g k o from row 2, 3
+// d h l p from row 2, 3
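+
+// i.e. two rounds of interleaves: punpckldq/punpckhdq build the 2x2 blocks,
+// then punpcklqdq/punpckhqdq assemble the full transposed rows.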
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // a b c d
+ "movdqu (%0,%3),%%xmm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "movdqu (%0),%%xmm2 \n" // i j k l
+ "movdqu (%0,%3),%%xmm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ // Transpose 2x2
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1
+ "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3
+ "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1
+ "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3
+
+ // Transpose 4x4
+ "movdqa %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1
+ "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1
+ "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3
+ "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3
+
+ "movdqu %%xmm0,(%1) \n"
+ "lea 16(%1,%4),%1 \n" // dst += stride + 16
+ "movdqu %%xmm1,-16(%1) \n"
+ "movdqu %%xmm2,-16(%1,%4) \n"
+ "movdqu %%xmm3,-16(%1,%4,2) \n"
+ "sub %4,%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+rm"(width) // %2
+ : "r"((ptrdiff_t)(src_stride)), // %3
+ "r"((ptrdiff_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSE4X4_32_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // a b c d
+ "vmovdqu (%0,%3),%%xmm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "vmovdqu (%0),%%xmm2 \n" // i j k l
+ "vmovdqu (%0,%3),%%xmm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d
+ "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l
+ "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ // Transpose 2x2
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3
+
+ // Transpose 4x4
+ "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1
+ "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1
+ "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3
+ "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3
+
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 32(%1,%4),%1 \n" // dst += stride + 32
+ "vmovdqu %%ymm1,-32(%1) \n"
+ "vmovdqu %%ymm2,-32(%1,%4) \n"
+ "vmovdqu %%ymm3,-32(%1,%4,2) \n"
+ "sub %4,%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+rm"(width) // %2
+ : "r"((ptrdiff_t)(src_stride)), // %3
+ "r"((ptrdiff_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/rotate_lsx.cc b/source/rotate_lsx.cc
new file mode 100644
index 00000000..94a2b91c
--- /dev/null
+++ b/source/rotate_lsx.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
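+// Note (illustrative): each ILVLH_* macro pairs __lsx_vilvl_* (interleave
+// the low halves of two vectors) with __lsx_vilvh_* (interleave the high
+// halves) at byte, halfword, word or doubleword granularity.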
+#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
+ _stride3, _stride4) \
+ { \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ __lsx_vstx(_dst2, _dst, _stride2); \
+ __lsx_vstx(_dst3, _dst, _stride3); \
+ _dst += _stride4; \
+ }
+
+#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
+ { \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ _dst += _stride2; \
+ }
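+
+// The ILVLH_* macros build the transpose out of paired low/high interleaves.
+// An illustrative scalar model of the byte variant (assumption: the
+// vilvl/vilvh pair interleaves the low then the high 8 bytes of two rows;
+// not part of the library):
+static inline void IlvlhBSketch(const uint8_t r0[16],
+                                const uint8_t r1[16],
+                                uint8_t lo[16],
+                                uint8_t hi[16]) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    lo[2 * i] = r0[i];      // low halves, bytes alternating r0/r1
+    lo[2 * i + 1] = r1[i];
+    hi[2 * i] = r0[8 + i];  // high halves, bytes alternating r0/r1
+    hi[2 * i + 1] = r1[8 + i];
+  }
+}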
+
+void TransposeWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+ TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+ width);
+}
+
+void TransposeUVWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+ dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
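+
+// Layout check for the Wx16 wrappers above: transposing a W x 16 tile yields
+// a 16 x W tile, so source rows 8..15 become destination columns 8..15. That
+// is why the second Wx8 call reads from src + 8 * src_stride and writes to
+// dst + 8 (dst_a + 8 / dst_b + 8 in the UV variant).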
+
+void TransposeWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int x;
+ int len = width / 16;
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride2 = dst_stride << 1;
+ int dst_stride3 = dst_stride + dst_stride2;
+ int dst_stride4 = dst_stride2 << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg5, reg1);
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg6, reg2);
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg7, reg3);
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ src += 16;
+ }
+}
+
+void TransposeUVWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int x;
+ int len = width / 8;
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride_a2 = dst_stride_a << 1;
+ int dst_stride_b2 = dst_stride_b << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg5, reg1);
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg6, reg2);
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg7, reg3);
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ src += 16;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/rotate_msa.cc b/source/rotate_msa.cc
index 99bdca65..99bdca65 100644
--- a/files/source/rotate_msa.cc
+++ b/source/rotate_msa.cc
diff --git a/files/source/rotate_neon.cc b/source/rotate_neon.cc
index fdc0dd47..569a7318 100644
--- a/files/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -408,6 +410,46 @@ void TransposeUVWx8_NEON(const uint8_t* src,
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
+ "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
+ "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
+ "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
+ "subs %8, %8, #4 \n" // w -= 4
+ "vst1.8 {q0}, [%4]! \n"
+ "vst1.8 {q1}, [%5]! \n"
+ "vst1.8 {q2}, [%6]! \n"
+ "vst1.8 {q3}, [%7]! \n"
+ "bgt 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(dst1), // %5
+ "+r"(dst2), // %6
+ "+r"(dst3), // %7
+ "+r"(width) // %8
+ : "r"((ptrdiff_t)(src_stride * 4)) // %9
+ : "memory", "cc", "q0", "q1", "q2", "q3");
+}
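+
+// The vld4.32 lane loads perform the transpose during the load: after four
+// loads, q0 holds column 0 of all four rows, q1 column 1, and so on, so the
+// st1 stores write whole rows. An illustrative intrinsics sketch of one 4x4
+// block (assumes <arm_neon.h>; not part of the library):
+//   uint32x4x4_t v = {};
+//   v = vld4q_lane_u32((const uint32_t*)(src + 0 * src_stride), v, 0);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 1 * src_stride), v, 1);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 2 * src_stride), v, 2);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 3 * src_stride), v, 3);
+//   vst1q_u32((uint32_t*)(dst + 0 * dst_stride), v.val[0]);
+//   vst1q_u32((uint32_t*)(dst + 1 * dst_stride), v.val[1]);
+//   vst1q_u32((uint32_t*)(dst + 2 * dst_stride), v.val[2]);
+//   vst1q_u32((uint32_t*)(dst + 3 * dst_stride), v.val[3]);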
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/rotate_neon64.cc b/source/rotate_neon64.cc
index f469baac..95047fa7 100644
--- a/files/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
-
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -185,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src,
"4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst), // %2
- "+r"(width) // %3
- : "r"(&kVTbl4x4Transpose), // %4
- "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"((ptrdiff_t)src_stride), // %5
+ "r"((ptrdiff_t)dst_stride) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
@@ -406,18 +423,57 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst_a), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
- "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
- "r"(&kVTbl4x4TransposeDi) // %8
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"((ptrdiff_t)src_stride), // %5
+ "r"((ptrdiff_t)dst_stride_a), // %6
+ "r"((ptrdiff_t)dst_stride_b), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
+ "subs %w8, %w8, #4 \n" // w -= 4
+ "st1 {v0.4s}, [%4], 16 \n"
+ "st1 {v1.4s}, [%5], 16 \n"
+ "st1 {v2.4s}, [%6], 16 \n"
+ "st1 {v3.4s}, [%7], 16 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(dst1), // %5
+ "+r"(dst2), // %6
+ "+r"(dst3), // %7
+ "+r"(width) // %8
+ : "r"((ptrdiff_t)(src_stride * 4)) // %9
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/rotate_win.cc b/source/rotate_win.cc
index e887dd52..a78873f8 100644
--- a/files/source/rotate_win.cc
+++ b/source/rotate_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
diff --git a/source/row_any.cc b/source/row_any.cc
new file mode 100644
index 00000000..e574543c
--- /dev/null
+++ b/source/row_any.cc
@@ -0,0 +1,2459 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The memset of vin clears the padded source buffers so that SIMD code that
+// reads a full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// from the source type (e.g. ARGB) and the mask (last parameter), or by
+// examining how far the source pointers are advanced in the source code.
+
+// A subsampled source width is increased by 1 if it is not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
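+// For example (a quick check of the rounding arithmetic):
+//   SS(5, 1) == 3   // 5 luma pixels need 3 subsampled chroma samples
+//   SS(4, 1) == 2
+//   SS(5, 0) == 5   // shift 0: full resolution, no rounding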
+
+// Any 4 planes to 1
+#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 192, a_buf + n, r); \
+ ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
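+
+// Hand-expanded sketch of one instance, with the mask folded in:
+// ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) generates
+// approximately the following (not literal preprocessor output):
+//   void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf,
+//                              const uint8_t* v_buf, const uint8_t* a_buf,
+//                              uint8_t* dst_ptr, int width) {
+//     SIMD_ALIGNED(uint8_t vin[64 * 4]);
+//     SIMD_ALIGNED(uint8_t vout[64]);
+//     memset(vin, 0, sizeof(vin));
+//     int r = width & 7;   // remainder pixels
+//     int n = width & ~7;  // multiple-of-8 main portion
+//     if (n > 0) {
+//       MergeARGBRow_SSE2(y_buf, u_buf, v_buf, a_buf, dst_ptr, n);
+//     }
+//     memcpy(vin, y_buf + n, r);       // UVSHIFT == 0: planes are full width
+//     memcpy(vin + 64, u_buf + n, r);
+//     memcpy(vin + 128, v_buf + n, r);
+//     memcpy(vin + 192, a_buf + n, r);
+//     MergeARGBRow_SSE2(vin, vin + 64, vin + 128, vin + 192, vout, 8);
+//     memcpy(dst_ptr + n * 4, vout, r * 4);  // BPP == 4
+//   }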
+
+#ifdef HAS_MERGEARGBROW_SSE2
+ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEARGBROW_AVX2
+ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEARGBROW_NEON
+ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
+#endif
+
+// Note that odd-width replication also applies to 444 formats, because the
+// ARM implementation subsamples 444 to 422 internally.
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 192, a_buf + n, r); \
+ if (width & 1) { \
+ vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \
+ vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \
+ MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_NEON
+ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_MSA
+ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_LSX
+ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_LASX
+ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
+#endif
+#undef ANY41C
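+
+// Worked example of the odd-width replication above (an ANY41C instance with
+// UVSHIFT == 1, MASK == 7): width = 7 gives n = 0 and r = 7, so 7 luma and
+// 7 alpha bytes are copied, SS(7, 1) == 4 chroma bytes land at vin + 64 and
+// vin + 128, and the odd width triggers vin[68] = vin[67] and
+// vin[132] = vin[131], so the last chroma pair is well defined.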
+
+// Any 4 planes to 1 plane of 8 bit with yuvconstants
+#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 48, a_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
+ I210AlphaToARGBRow_SSSE3,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
+#endif
+
+#ifdef HAS_I210ALPHATOARGBROW_AVX2
+ANY41CT(I210AlphaToARGBRow_Any_AVX2,
+ I210AlphaToARGBRow_AVX2,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+ANY41CT(I410AlphaToARGBRow_Any_SSSE3,
+ I410AlphaToARGBRow_SSSE3,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_AVX2
+ANY41CT(I410AlphaToARGBRow_Any_AVX2,
+ I410AlphaToARGBRow_AVX2,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
+#endif
+
+#undef ANY41CT
+
+// Any 4 planes to 1 plane with parameter
+#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+ const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \
+ SIMD_ALIGNED(STYPE vin[16 * 4]); \
+ SIMD_ALIGNED(DTYPE vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \
+ } \
+ memcpy(vin, r_buf + n, r * SBPP); \
+ memcpy(vin + 16, g_buf + n, r * SBPP); \
+ memcpy(vin + 32, b_buf + n, r * SBPP); \
+ memcpy(vin + 48, a_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \
+ memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_MERGEAR64ROW_AVX2
+ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEAR64ROW_NEON
+ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+ANY41PT(MergeARGB16To8Row_Any_AVX2,
+ MergeARGB16To8Row_AVX2,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_NEON
+ANY41PT(MergeARGB16To8Row_Any_NEON,
+ MergeARGB16To8Row_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 7)
+#endif
+
+#undef ANY41PT
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGEXRGBROW_SSE2
+ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEXRGBROW_AVX2
+ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEXRGBROW_NEON
+ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_LSX
+ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_LASX
+ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_LSX
+ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_LASX
+ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd-width replication also applies to 444 formats, because the
+// ARM implementation subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \
+ vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TORGBAROW_SSSE3
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_SSSE3
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_SSSE3
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_SSSE3
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I444TORGB24ROW_SSSE3
+ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX512BW
+ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I444TORGB24ROW_AVX2
+ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I444TORGB24ROW_NEON
+ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_LSX
+ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15)
+ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15)
+ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15)
+ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15)
+ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15)
+ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_LASX
+ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31)
+ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31)
+#endif
+#ifdef HAS_I444TOARGBROW_LSX
+ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15)
+#endif
+#undef ANY31C
+
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
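+
+// Note the buffer offsets above are in elements of T, not bytes: for
+// T == uint16_t, vin + 16 is 32 bytes past vin, and r * SBPP with SBPP == 2
+// copies r 16-bit samples.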
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I410TOAR30ROW_SSSE3
+ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_SSSE3
+ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_AVX2
+ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I410TOAR30ROW_AVX2
+ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_SSSE3
+ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_SSSE3
+ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_AVX2
+ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_AVX2
+ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#undef ANY31CT
+
+// Any 3 planes to 1 plane with parameter
+#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+ DTYPE* dst_ptr, int depth, int width) { \
+ SIMD_ALIGNED(STYPE vin[16 * 3]); \
+ SIMD_ALIGNED(DTYPE vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \
+ } \
+ memcpy(vin, r_buf + n, r * SBPP); \
+ memcpy(vin + 16, g_buf + n, r * SBPP); \
+ memcpy(vin + 32, b_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \
+ memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
+#endif
+
+#ifdef HAS_MERGEXR30ROW_NEON
+ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
+ANY31PT(MergeXR30Row_10_Any_NEON,
+ MergeXR30Row_10_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 3)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_NEON
+ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+ANY31PT(MergeXRGB16To8Row_Any_AVX2,
+ MergeXRGB16To8Row_AVX2,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_NEON
+ANY31PT(MergeXRGB16To8Row_Any_NEON,
+ MergeXRGB16To8Row_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 7)
+#endif
+
+#undef ANY31PT
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n * SBPP, r * SBPP); \
+ memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(vin, vin + 128, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX512BW
+ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_LSX
+ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_SSSE3
+ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_LSX
+ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_LASX
+ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_LSX
+ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_LASX
+ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_LSX
+ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_LASX
+ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_LSX
+ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_LSX
+ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_LSX
+ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with stride.
+// width is measured in source pixels; every 4 source bytes contain 2 pixels.
+#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[32 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int awidth = (width + 1) / 2; \
+ int r = awidth & MASK; \
+ int n = awidth & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \
+ } \
+ memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \
+ memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \
+ ANY_SIMD(vin, 32, vout, MASK + 1); \
+ memcpy(dst_uv + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_YUY2TONVUVROW_NEON
+ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_SSE2
+ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_AVX2
+ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15)
+#endif
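+
+// Worked example for the stride variant above (MASK == 7, SBPP == 4):
+// width = 17 luma pixels gives awidth = (17 + 1) / 2 = 9 four-byte
+// macropixels, so n = 8 macropixels go through the SIMD path (called with
+// n * 2 = 16) and the r = 1 leftover macropixel is copied from both rows
+// into vin and vin + 32, then processed from the padded buffers with a row
+// stride of 32.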
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n * SBPP, r * SBPP); \
+ memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_LSX
+ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_LASX
+ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_LSX
+ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_LASX
+ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_LSX
+ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_LASX
+ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15)
+#endif
+#undef ANY21C
+
+// Any 2 planes of 16 bit to 1 with yuvconstants
+#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \
+ ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
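+
+// uv_buf holds interleaved U/V samples, so each chroma position is two
+// elements of T: the tail copy starts at uv_buf + 2 * (n >> UVSHIFT) and
+// moves SS(r, UVSHIFT) * 2 samples into vin + 16.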
+
+#ifdef HAS_P210TOAR30ROW_SSSE3
+ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_SSSE3
+ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_AVX2
+ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P210TOAR30ROW_AVX2
+ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_SSSE3
+ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_SSSE3
+ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_AVX2
+ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_AVX2
+ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+
+#undef ANY21CT
+
+// Any 2 16-bit planes with a depth parameter to 1 interleaved plane
+#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(T vout[16]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \
+ } \
+ memcpy(vin, src_u + n, r * BPP); \
+ memcpy(vin + 16, src_v + n, r * BPP); \
+ ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \
+ memcpy(dst_uv + n * 2, vout, r * BPP * 2); \
+ }
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
+#endif
+#ifdef HAS_MERGEUVROW_16_NEON
+ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY21PT
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
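+
+// UVSHIFT halves the pixel-to-byte step for packed 4:2:2 sources: YUY2ToYRow
+// uses UVSHIFT = 1 with SBPP = 4, so the remainder copy reads
+// SS(r, 1) = (r + 1) / 2 four-byte groups (each YUY2 group holds 2 pixels),
+// while plain per-pixel conversions use UVSHIFT = 0.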
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_LSX)
+ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15)
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ANY11(J400ToARGBRow_Any_LSX, J400ToARGBRow_LSX, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ANY11(RAWToRGB24Row_Any_LSX, RAWToRGB24Row_LSX, 0, 3, 3, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYJROW_AVX2
+ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_SSE2
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_SSSE3
+ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_LSX
+ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_LASX
+ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_NEON
+ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_LSX
+ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_LSX
+ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_LSX
+ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_LASX
+ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_LASX
+ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYJROW_LASX
+ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_LSX
+ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_LASX
+ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_LSX
+ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYROW_LASX
+ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_LSX
+ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_LASX
+ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_LSX
+ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_LSX
+ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_LASX
+ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYROW_LASX
+ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_LSX
+ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_LASX
+ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_LSX
+ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_LASX
+ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_LSX
+ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_LASX
+ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_LSX
+ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_LASX
+ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_LSX
+ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_LASX
+ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_LSX
+ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_LASX
+ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_LSX
+ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_LASX
+ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_LSX
+ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_LASX
+ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_LSX
+ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_LASX
+ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_LSX
+ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_LASX
+ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_LSX
+ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_LASX
+ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_LSX
+ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_LASX
+ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_LSX
+ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended. The destination is read-modify-write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ memset(vout, 0, sizeof(vout)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(vout, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
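+
+// vout is seeded with the existing destination pixels before the SIMD call so
+// the read-modify-write of the final partial block blends against real data
+// rather than zeros; only the r valid pixels are stored back.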
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(vin, vout, param, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ANY11P(I400ToARGBRow_Any_LSX,
+ I400ToARGBRow_LSX,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+ ARGBToRGB565DitherRow_SSE2,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+ ARGBToRGB565DitherRow_AVX2,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+ ARGBToRGB565DitherRow_NEON,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+ ARGBToRGB565DitherRow_MSA,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ANY11P(ARGBToRGB565DitherRow_Any_LSX,
+ ARGBToRGB565DitherRow_LSX,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ANY11P(ARGBToRGB565DitherRow_Any_LASX,
+ ARGBToRGB565DitherRow_LASX,
+ const uint32_t,
+ 4,
+ 2,
+ 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LSX
+ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LASX
+ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with type
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]); \
+ SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \
+ ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \
+ memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \
+ }
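+
+// SBPP and BPP are byte counts here, so the offset arithmetic casts through
+// uint8_t* even though the row functions take typed pointers; e.g.
+// ARGBToAR64 reads 4 bytes and writes 8 bytes per pixel.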
+
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_SSSE3
+ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_SSSE3
+ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_NEON
+ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_NEON
+ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_NEON
+ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_NEON
+ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#undef ANY11T
+
+// Any 1 to 1 with a scale parameter. SBPP and BPP are bytes per element.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+ SIMD_ALIGNED(STYPE vin[32]); \
+ SIMD_ALIGNED(DTYPE vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+ } \
+ memcpy(vin, src_ptr + n, r * SBPP); \
+ ANY_SIMD(vin, vout, scale, MASK + 1); \
+ memcpy(dst_ptr + n, vout, r * BPP); \
+ }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+ Convert16To8Row_SSSE3,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+ Convert16To8Row_AVX2,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 31)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_NEON
+ANY11C(Convert16To8Row_Any_NEON,
+ Convert16To8Row_NEON,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+ Convert8To16Row_SSE2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+ Convert8To16Row_AVX2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 31)
+#endif
+#ifdef HAS_MULTIPLYROW_16_AVX2
+ANY11C(MultiplyRow_16_Any_AVX2,
+ MultiplyRow_16_AVX2,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 31)
+#endif
+#ifdef HAS_MULTIPLYROW_16_NEON
+ANY11C(MultiplyRow_16_Any_NEON,
+ MultiplyRow_16_NEON,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_DIVIDEROW_16_AVX2
+ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31)
+#endif
+#ifdef HAS_DIVIDEROW_16_NEON
+ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with a float parameter, such as half-float conversion. BPP is
+// measured in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
+ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+ SIMD_ALIGNED(ST vin[32]); \
+ SIMD_ALIGNED(T vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(vin, src_ptr + n, r * SBPP); \
+ ANY_SIMD(vin, vout, param, MASK + 1); \
+ memcpy(dst_ptr + n, vout, r * BPP); \
+ }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+ HalfFloat1Row_F16C,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+ HalfFloat1Row_NEON,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_LSX
+ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#undef ANY11P16
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ANY11C(YUY2ToARGBRow_Any_LSX, YUY2ToARGBRow_LSX, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS vin[64 * 2]); \
+ SIMD_ALIGNED(TD vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \
+ }
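+
+// The temp buffer holds both source rows at a fixed stride of 64 elements
+// (vin and vin + 64); the second row is copied only when source_y_fraction
+// is nonzero, since a fraction of 0 reads a single row.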
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11I(InterpolateRow_Any_SSSE3,
+ InterpolateRow_SSSE3,
+ uint8_t,
+ uint8_t,
+ 1,
+ 1,
+ 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_LSX
+ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
+#endif
+
+#ifdef HAS_INTERPOLATEROW_16_NEON
+ANY11I(InterpolateRow_16_Any_NEON,
+ InterpolateRow_16_NEON,
+ uint16_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#undef ANY11I
+
+// Any 1 to 1 interpolate with scale param
+#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int scale, int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS vin[64 * 2]); \
+ SIMD_ALIGNED(TD vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_16TO8_NEON
+ANY11IS(InterpolateRow_16To8_Any_NEON,
+ InterpolateRow_16To8_NEON,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+ANY11IS(InterpolateRow_16To8_Any_AVX2,
+ InterpolateRow_16To8_AVX2,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 31)
+#endif
+
+#undef ANY11IS
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr, r* BPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
+ }
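+
+// Mirroring reverses the remainder handling: with width = 100 and MASK = 31,
+// the SIMD call mirrors src[4..99] into dst[0..95], while the first r = 4
+// source pixels go through the temp buffer and land in dst[96..99], taken
+// from the tail of vout at offset (MASK + 1 - r) * BPP.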
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
+#ifdef HAS_MIRRORROW_LSX
+ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_LASX
+ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_LSX
+ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_LASX
+ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_LSX
+ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_LASX
+ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(vout, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
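+
+// A fill needs no input buffer: the remainder is generated into the temp
+// buffer as one full block and its first r pixels are copied to the
+// destination tail.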
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
+#endif
+#ifdef HAS_SETROW_LSX
+ANY1(SetRow_Any_LSX, SetRow_LSX, uint8_t, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_LSX
+ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(vin, vout, vout + 128, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \
+ }
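+
+// Both output planes share one temp buffer split at a fixed 128-byte offset
+// (vout for U, vout + 128 for V); DUVSHIFT scales how many output bytes the
+// r leftover pixels produce.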
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_LSX
+ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LSX
+ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LASX
+ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31)
+ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31)
+#endif
+#undef ANY12
+
+// Any 1 interleaved 16-bit plane with a depth parameter to 2 planes
+#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(T vout[16 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \
+ } \
+ memcpy(vin, src_uv + n * 2, r * BPP * 2); \
+ ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \
+ memcpy(dst_u + n, vout, r * BPP); \
+ memcpy(dst_v + n, vout + 16, r * BPP); \
+ }
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
+#endif
+
+#ifdef HAS_SPLITUVROW_16_NEON
+ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY12PT
+
+// Any 1 to 3. Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 3]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
+ } \
+ memcpy(vin, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \
+ memcpy(dst_r + n, vout, r); \
+ memcpy(dst_g + n, vout + 16, r); \
+ memcpy(dst_b + n, vout + 32, r); \
+ }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+#ifdef HAS_SPLITXRGBROW_SSE2
+ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_SSSE3
+ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_AVX2
+ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITXRGBROW_NEON
+ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
+#endif
+
+// Any 1 to 4. Outputs ARGB planes.
+#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, uint8_t* dst_a, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 4]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \
+ } \
+ memcpy(vin, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \
+ memcpy(dst_r + n, vout, r); \
+ memcpy(dst_g + n, vout + 16, r); \
+ memcpy(dst_b + n, vout + 32, r); \
+ memcpy(dst_a + n, vout + 48, r); \
+ }
+
+#ifdef HAS_SPLITARGBROW_SSE2
+ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_SSSE3
+ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_AVX2
+ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITARGBROW_NEON
+ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
+#endif
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// A 128-byte temp row allows for 32 AVX ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \
+ uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \
+ vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \
+ memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \
+ }
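+
+// For UVSHIFT == 0 formats (e.g. ARGB) with an odd width, the last pixel of
+// both temp rows is replicated so the 2x2 subsample producing the final U/V
+// sample averages a real pixel instead of the zero padding.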
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVJROW_AVX2
+ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVJROW_SSSE3
+ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_LSX
+ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_LASX
+ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVJROW_NEON
+ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_LSX
+ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_LASX
+ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_LSX
+ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_LSX
+ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_LSX
+ANY12S(RGBAToUVRow_Any_LSX, RGBAToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVJROW_NEON
+ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_LSX
+ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_LASX
+ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVJROW_NEON
+ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_LSX
+ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_LASX
+ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_LSX
+ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_LASX
+ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_LSX
+ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_LASX
+ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_LSX
+ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_LASX
+ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_LSX
+ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_LASX
+ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
+#endif
+#undef ANY12S
+
+// Any 1 to 1 with source stride (2 rows of source). Outputs an interleaved
+// UV plane. A 128-byte temp row allows for 32 AVX ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \
+ vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(vin, 128, vout, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
+#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \
+ SIMD_ALIGNED(T vin[16]); \
+ SIMD_ALIGNED(T vout[16]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src, src_tile_stride, dst, n); \
+ } \
+ memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \
+ ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \
+ memcpy(dst + n, vout, r * BPP); \
+ }
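+
+// Tiles are 16 pixels wide, so the source advances by one tile stride per 16
+// output pixels; the remainder therefore starts at (n / 16) * src_tile_stride.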
+
+#ifdef HAS_DETILEROW_NEON
+ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15)
+#endif
+#ifdef HAS_DETILEROW_SSE2
+ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15)
+#endif
+#ifdef HAS_DETILEROW_16_NEON
+ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15)
+#endif
+#ifdef HAS_DETILEROW_16_SSE2
+ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15)
+#endif
+#ifdef HAS_DETILEROW_16_AVX
+ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15)
+#endif
+
+// DetileSplitUVRow: width is in bytes.
+#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16]); \
+ SIMD_ALIGNED(uint8_t vout[8 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \
+ ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \
+ memcpy(dst_u + n / 2, vout, (r + 1) / 2); \
+ memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \
+ }
+
+#ifdef HAS_DETILESPLITUVROW_NEON
+ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15)
+#endif
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
+#endif
+
+#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \
+ const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \
+ uint8_t* dst_yuy2, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \
+ n); \
+ } \
+ memcpy(vin, src_y + (n / 16) * src_y_tile_stride, r); \
+ memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \
+ ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \
+ memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \
+ }
+
+#ifdef HAS_DETILETOYUY2_NEON
+ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
+#endif
+
+#ifdef HAS_DETILETOYUY2_SSE2
+ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_common.cc b/source/row_common.cc
index 8951d003..3afc4b4d 100644
--- a/files/source/row_common.cc
+++ b/source/row_common.cc
@@ -10,34 +10,72 @@
#include "libyuv/row.h"
-#include <stdio.h>
+#include <assert.h>
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// This macro makes YUV-to-RGB conversion use unsigned math, extending the
+// range of the YUV-to-RGB coefficients from 0..2 to 0..4 for more accuracy
+// on B:
+// LIBYUV_UNLIMITED_DATA
+
+// Macros to enable unlimited data for each colorspace
+// LIBYUV_UNLIMITED_BT601
+// LIBYUV_UNLIMITED_BT709
+// LIBYUV_UNLIMITED_BT2020
+
+// The following macro from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point for ARGBToI420:
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+ defined(_MSC_VER) && !defined(__clang__) && \
+ (defined(_M_IX86) || defined(_M_X64))
+#define LIBYUV_RGB7 1
+#endif
+
+#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86))
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
// LLVM on x86 generates poor code for the ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
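+// These rely on -(cond) producing an all-ones mask when cond is true:
+// clamp0(-7) yields -(0) & -7 = 0 while clamp0(7) yields -1 & 7 = 7, and
+// Abs uses m = -(v < 0), so for v = -5, (v + m) ^ m = -6 ^ -1 = 5.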
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
+}
+
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (-(v >= max) | v) & max;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
@@ -53,6 +91,10 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (v > max) ? max : v;
+}
+
static __inline uint32_t Abs(int32_t v) {
return (v < 0) ? -v : v;
}
@@ -111,6 +153,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
+ dst_rgba[0] = 255u;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_raw += 3;
+ }
+}
+
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -130,12 +187,13 @@ void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_rgb565[0] & 0x1f;
- uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r = src_rgb565[1] >> 3;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 2) | (g >> 4);
- dst_argb[2] = (r << 3) | (r >> 2);
+ uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4));
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_argb[3] = 255u;
dst_argb += 4;
src_rgb565 += 2;
@@ -147,13 +205,14 @@ void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_argb1555[0] & 0x1f;
- uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t a = src_argb1555[1] >> 7;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 3) | (g >> 2);
- dst_argb[2] = (r << 3) | (r >> 2);
+ uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2));
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_argb[3] = -a;
dst_argb += 4;
src_argb1555 += 2;
@@ -165,14 +224,14 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_argb4444[0] & 0x0f;
- uint8_t g = src_argb4444[0] >> 4;
- uint8_t r = src_argb4444[1] & 0x0f;
- uint8_t a = src_argb4444[1] >> 4;
- dst_argb[0] = (b << 4) | b;
- dst_argb[1] = (g << 4) | g;
- dst_argb[2] = (r << 4) | r;
- dst_argb[3] = (a << 4) | a;
+ uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f);
+ uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4);
+ uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f);
+ uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b);
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g);
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r);
+ dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a);
dst_argb += 4;
src_argb4444 += 2;
}
@@ -181,7 +240,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
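+ // A fixed-size memcpy expresses the unaligned load without the
+ // strict-aliasing violation of the old cast; compilers typically lower
+ // it to a single 32-bit load.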
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -195,7 +255,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -209,7 +270,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
@@ -219,6 +281,54 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
}
}
+void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_abgr[0] = r;
+ dst_abgr[1] = g;
+ dst_abgr[2] = b;
+ dst_abgr[3] = a;
+ dst_abgr += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_bgra[0] = a;
+ dst_bgra[1] = r;
+ dst_bgra[2] = g;
+ dst_bgra[3] = b;
+ dst_bgra += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_rgba[0] = a;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_argb += 4;
+ }
+}
+
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -247,6 +357,22 @@ void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
}
}
+void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t a = src_rgba[0];
+ uint8_t b = src_rgba[1];
+ uint8_t g = src_rgba[2];
+ uint8_t r = src_rgba[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_rgba += 4;
+ }
+}
+
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -265,7 +391,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t b0 = src_argb[0] >> 3;
uint8_t g0 = src_argb[1] >> 2;
uint8_t r0 = src_argb[2] >> 3;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -279,29 +405,31 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
// or the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
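+ // Each byte of dither4 holds the dither offset for one of 4 pixel
+ // columns, selected by x & 3.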
- uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
- uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
- uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
- uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
- (r1 << 27));
+ uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3);
+ uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2);
+ uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3);
+ uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3);
+ uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2);
+ uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11));
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
- uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3);
+ uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2);
+ uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3);
+ *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -316,8 +444,10 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 3;
uint8_t r1 = src_argb[6] >> 3;
uint8_t a1 = src_argb[7] >> 7;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15));
dst_rgb += 4;
src_argb += 8;
}
@@ -326,7 +456,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g0 = src_argb[1] >> 3;
uint8_t r0 = src_argb[2] >> 3;
uint8_t a0 = src_argb[3] >> 7;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ *(uint16_t*)(dst_rgb) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15));
}
}
@@ -341,8 +472,10 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 4;
uint8_t r1 = src_argb[6] >> 4;
uint8_t a1 = src_argb[7] >> 4;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12));
dst_rgb += 4;
src_argb += 8;
}
@@ -351,18 +484,20 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g0 = src_argb[1] >> 4;
uint8_t r0 = src_argb[2] >> 4;
uint8_t a0 = src_argb[3] >> 4;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ *(uint16_t*)(dst_rgb) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12));
}
}
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+ uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
- uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+ uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
uint32_t a0 = (src_abgr[3] >> 6);
- *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+ *(uint32_t*)(dst_ar30) =
+ STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30));
dst_ar30 += 4;
src_abgr += 4;
}
@@ -375,62 +510,249 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
uint32_t a0 = (src_argb[3] >> 6);
- *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+ *(uint32_t*)(dst_ar30) =
+ STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30));
dst_ar30 += 4;
src_argb += 4;
}
}
-static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
- return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
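+ // Multiplying by 0x0101 replicates the 8-bit value into both bytes,
+ // mapping 0..255 evenly onto 0..65535 (v * 257).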
+ uint16_t b = src_argb[0] * 0x0101;
+ uint16_t g = src_argb[1] * 0x0101;
+ uint16_t r = src_argb[2] * 0x0101;
+ uint16_t a = src_argb[3] * 0x0101;
+ dst_ar64[0] = b;
+ dst_ar64[1] = g;
+ dst_ar64[2] = r;
+ dst_ar64[3] = a;
+ dst_ar64 += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint16_t b = src_argb[0] * 0x0101;
+ uint16_t g = src_argb[1] * 0x0101;
+ uint16_t r = src_argb[2] * 0x0101;
+ uint16_t a = src_argb[3] * 0x0101;
+ dst_ab64[0] = r;
+ dst_ab64[1] = g;
+ dst_ab64[2] = b;
+ dst_ab64[3] = a;
+ dst_ab64 += 4;
+ src_argb += 4;
+ }
+}
+
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_ar64[0] >> 8;
+ uint8_t g = src_ar64[1] >> 8;
+ uint8_t r = src_ar64[2] >> 8;
+ uint8_t a = src_ar64[3] >> 8;
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_ar64 += 4;
+ }
+}
+
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_ab64[0] >> 8;
+ uint8_t g = src_ab64[1] >> 8;
+ uint8_t b = src_ab64[2] >> 8;
+ uint8_t a = src_ab64[3] >> 8;
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_ab64 += 4;
+ }
+}
+
+void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint16_t b = src_ar64[0];
+ uint16_t g = src_ar64[1];
+ uint16_t r = src_ar64[2];
+ uint16_t a = src_ar64[3];
+ dst_ab64[0] = r;
+ dst_ab64[1] = g;
+ dst_ab64[2] = b;
+ dst_ab64[3] = a;
+ dst_ab64 += 4;
+ src_ar64 += 4;
+ }
+}
+
+// TODO(fbarchard): Make shuffle compatible with SIMD versions
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+ uint8_t* dst_ar64,
+ const uint8_t* shuffler,
+ int width) {
+ const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64;
+ uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64;
+ int index0 = shuffler[0] / 2;
+ int index1 = shuffler[2] / 2;
+ int index2 = shuffler[4] / 2;
+ int index3 = shuffler[6] / 2;
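+ // The shuffler holds byte indices; dividing by 2 converts them to
+ // 16-bit channel indices.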
+ // Shuffle a row of AR64.
+ int x;
+ for (x = 0; x < width / 2; ++x) {
+ // Read all components before writing to support in-place conversion.
+ uint16_t b = src_ar64_16[index0];
+ uint16_t g = src_ar64_16[index1];
+ uint16_t r = src_ar64_16[index2];
+ uint16_t a = src_ar64_16[index3];
+ dst_ar64_16[0] = b;
+ dst_ar64_16[1] = g;
+ dst_ar64_16[2] = r;
+ dst_ar64_16[3] = a;
+ src_ar64_16 += 4;
+ dst_ar64_16 += 4;
+ }
+}
+
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16);
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * 128 + 0x1000 (for +16) + 0x0080 (for rounding).
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+// 0x7e80) >> 8;
+
+static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}
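+// Example: white (255,255,255) -> ((66 + 129 + 25) * 255 + 0x1080) >> 8 = 235
+// and black (0,0,0) -> 0x1080 >> 8 = 16, the BT.601 limited-range extremes.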
+#endif
+
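+// AVGB computes the rounded average of two bytes, matching x86 pavgb.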
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8);
+}
+static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8);
}
+#else
+// TODO(fbarchard): Add rounding to x86 SIMD and use this
+static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+}
+static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+}
+#endif
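+// In both variants 0x8000 supplies the +128 output bias (128 << 8); the
+// extra 0x80 in the rounding variant adds 0.5 before the >> 8.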
-static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
- return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline uint8_t RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return STATIC_CAST(
+ uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8);
}
-static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
- return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+static __inline uint8_t RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return STATIC_CAST(
+ uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8);
}
+#endif
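+// The RGB2x helpers take 2x-scaled sums of two pixels (0..510), so they use
+// halved coefficients to stay on the same scale as RGBToU/RGBToV.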
// ARGBToY_C and ARGBToUV_C
-#define MAKEROWY(NAME, R, G, B, BPP) \
- void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
+// Intel version mimics SSE/AVX, which subsamples with 2 pavgb instructions.
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+ }
+#else
+// ARM version computes sum / 2, then multiplies by 2x smaller coefficients.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
}
+#endif
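+// Each invocation below instantiates NAME##ToYRow_C and NAME##ToUVRow_C with
+// the byte offsets of R, G and B within a pixel and BPP bytes per pixel;
+// e.g. ARGB is stored B,G,R,A, so R=2, G=1, B=0, BPP=4.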
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -448,14 +770,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated):
// b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84
@@ -465,68 +787,132 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
-static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
+#else
+// 8 bit
+static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+ return (77 * r + 150 * g + 29 * b + 128) >> 8;
+}
+#endif
-static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
-static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
+static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
- void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
+// Intel version mimics SSE/AVX, which subsamples with 2 pavgb instructions.
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
}
+#else
+// ARM version computes sum / 2, then multiplies by 2x smaller coefficients.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(ABGR, 0, 1, 2, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t b = src_rgb565[0] & 0x1f;
- uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
uint8_t r = src_rgb565[1] >> 3;
- b = (b << 3) | (b >> 2);
- g = (g << 2) | (g >> 4);
- r = (r << 3) | (r >> 2);
+ b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4));
+ r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_y[0] = RGBToY(r, g, b);
src_rgb565 += 2;
dst_y += 1;
@@ -537,11 +923,12 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t b = src_argb1555[0] & 0x1f;
- uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
- b = (b << 3) | (b >> 2);
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
+ b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2));
+ r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_y[0] = RGBToY(r, g, b);
src_argb1555 += 2;
dst_y += 1;
@@ -554,9 +941,9 @@ void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
uint8_t b = src_argb4444[0] & 0x0f;
uint8_t g = src_argb4444[0] >> 4;
uint8_t r = src_argb4444[1] & 0x0f;
- b = (b << 4) | b;
- g = (g << 4) | g;
- r = (r << 4) | r;
+ b = STATIC_CAST(uint8_t, (b << 4) | b);
+ g = STATIC_CAST(uint8_t, (g << 4) | g);
+ r = STATIC_CAST(uint8_t, (r << 4) | r);
dst_y[0] = RGBToY(r, g, b);
src_argb4444 += 2;
dst_y += 1;
@@ -571,45 +958,84 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8_t b0 = src_rgb565[0] & 0x1f;
- uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r0 = src_rgb565[1] >> 3;
- uint8_t b1 = src_rgb565[2] & 0x1f;
- uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
- uint8_t r1 = src_rgb565[3] >> 3;
- uint8_t b2 = next_rgb565[0] & 0x1f;
- uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b3 = next_rgb565[2] & 0x1f;
- uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
- uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f);
+ uint8_t g1 = STATIC_CAST(
+ uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3));
+ uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3);
+ uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f);
+ uint8_t g3 = STATIC_CAST(
+ uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3));
+ uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2));
+ g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4));
+ r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+ b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2));
+ g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4));
+ r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
+
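+ // Expand 5/6-bit channels to 8 bits before averaging so both subsample
+ // paths below see full-range values.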
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
- uint8_t b0 = src_rgb565[0] & 0x1f;
- uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r0 = src_rgb565[1] >> 3;
- uint8_t b2 = next_rgb565[0] & 0x1f;
- uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3);
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -621,46 +1047,85 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8_t b0 = src_argb1555[0] & 0x1f;
- uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t b1 = src_argb1555[2] & 0x1f;
- uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
- uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
- uint8_t b2 = next_argb1555[0] & 0x1f;
- uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
- uint8_t b3 = next_argb1555[2] & 0x1f;
- uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
- uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f);
+ uint8_t g1 = STATIC_CAST(
+ uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3));
+ uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2);
+ uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f);
+ uint8_t g3 = STATIC_CAST(
+ uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3));
+ uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2));
+ g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2));
+ r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+ b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2));
+ g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2));
+ r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
- uint8_t b0 = src_argb1555[0] & 0x1f;
- uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t b2 = next_argb1555[0] & 0x1f;
- uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -684,14 +1149,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+ g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+ r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+ b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1);
+ g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1);
+ r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1);
+ b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+ g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+ r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+ b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3);
+ g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
+ r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -704,14 +1189,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+ g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+ r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+ b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+ g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+ r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -754,9 +1252,9 @@ void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not overflow. a is preserved from the original.
- dst_argb[0] = sb;
- dst_argb[1] = clamp255(sg);
- dst_argb[2] = clamp255(sr);
+ dst_argb[0] = STATIC_CAST(uint8_t, sb);
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr));
dst_argb += 4;
}
}
@@ -785,10 +1283,10 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb,
int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
a * matrix_argb[15]) >>
6;
- dst_argb[0] = Clamp(sb);
- dst_argb[1] = Clamp(sg);
- dst_argb[2] = Clamp(sr);
- dst_argb[3] = Clamp(sa);
+ dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb));
+ dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg));
+ dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr));
+ dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa));
src_argb += 4;
dst_argb += 4;
}
@@ -838,9 +1336,12 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb,
int b = dst_argb[0];
int g = dst_argb[1];
int r = dst_argb[2];
- dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
- dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
- dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb[0] = STATIC_CAST(
+ uint8_t, (b * scale >> 16) * interval_size + interval_offset);
+ dst_argb[1] = STATIC_CAST(
+ uint8_t, (g * scale >> 16) * interval_size + interval_offset);
+ dst_argb[2] = STATIC_CAST(
+ uint8_t, (r * scale >> 16) * interval_size + interval_offset);
dst_argb += 4;
}
}
@@ -877,25 +1378,25 @@ void ARGBShadeRow_C(const uint8_t* src_argb,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 16
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const uint32_t b = REPEAT8(src_argb0[0]);
- const uint32_t g = REPEAT8(src_argb0[1]);
- const uint32_t r = REPEAT8(src_argb0[2]);
- const uint32_t a = REPEAT8(src_argb0[3]);
+ const uint32_t b = REPEAT8(src_argb[0]);
+ const uint32_t g = REPEAT8(src_argb[1]);
+ const uint32_t r = REPEAT8(src_argb[2]);
+ const uint32_t a = REPEAT8(src_argb[3]);
const uint32_t b_scale = src_argb1[0];
const uint32_t g_scale = src_argb1[1];
const uint32_t r_scale = src_argb1[2];
const uint32_t a_scale = src_argb1[3];
- dst_argb[0] = SHADE(b, b_scale);
- dst_argb[1] = SHADE(g, g_scale);
- dst_argb[2] = SHADE(r, r_scale);
- dst_argb[3] = SHADE(a, a_scale);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_scale));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -905,25 +1406,25 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp255(v + f)
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_add = src_argb1[0];
const int g_add = src_argb1[1];
const int r_add = src_argb1[2];
const int a_add = src_argb1[3];
- dst_argb[0] = SHADE(b, b_add);
- dst_argb[1] = SHADE(g, g_add);
- dst_argb[2] = SHADE(r, r_add);
- dst_argb[3] = SHADE(a, a_add);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -932,25 +1433,25 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp0(f - v)
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_sub = src_argb1[0];
const int g_sub = src_argb1[1];
const int r_sub = src_argb1[2];
const int a_sub = src_argb1[3];
- dst_argb[0] = SHADE(b, b_sub);
- dst_argb[1] = SHADE(g, g_sub);
- dst_argb[2] = SHADE(r, r_sub);
- dst_argb[3] = SHADE(a, a_sub);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -1058,257 +1559,244 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
}
}
-// TODO(fbarchard): Unify these structures to be platform independent.
-// TODO(fbarchard): Generate SIMD structures from float matrix.
+// Macros to create SIMD-specific YUV-to-RGB conversion constants.
-// BT.601 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.596
-// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
-// B = (Y - 16) * 1.164 - U * -2.018
+// clang-format off
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+// Bias values fold in the subtraction of 128 from U and V, the Y bias, and
+// rounding. For B and R the bias is negative; for G it is positive.
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
+ 0, 0}}
+#else
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
+#endif
+
+// clang-format on
+
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
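+
+// The kYvu constants swap the U and V coefficients so VU-ordered formats
+// (e.g. NV21) can reuse the same conversion kernels.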
+
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 + U * 2.018
+// KR = 0.299; KB = 0.114
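+// Coefficients below are stored as round(c * 64) (6-bit fixed point); the
+// C kernels shift results right by 6 to return to 8 bits.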
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__) // 64 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__) // 32 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
+#define UB 129 /* round(2.018 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR 102 /* round(1.596 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// JPEG YUV to RGB reference
-// * R = Y - V * -1.40200
-// * G = Y - U * 0.34414 - V * 0.71414
-// * B = Y - U * -1.77200
+// BT.601 full range YUV to RGB reference (aka JPEG)
+// * R = Y + V * 1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y + U * 1.77200
+// KR = 0.299; KB = 0.114
+
+// U and V contributions to R,G,B.
+#define UB 113 /* round(1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR 90 /* round(1.40200 * 64) */
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.709 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 + U * 2.112
+// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
+#define UB 135 /* round(2.112 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR 115 /* round(1.793 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// BT.709 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.793
-// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
-// B = (Y - 16) * 1.164 - U * -2.112
-// See also http://www.equasys.de/colorconversion.html
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126, KB = 0.0722
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// U and V contributions to R,G,B.
+#define UB 119 /* round(1.8556 * 64) */
+#define UG 12 /* round(0.18732 * 64) */
+#define VG 30 /* round(0.46812 * 64) */
+#define VR 101 /* round(1.5748 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.2020 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164384 + V * 1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 + U * 2.14177
+// KR = 0.2627; KB = 0.0593
-// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.112 * 64)) */
-#define UG 14 /* round(0.213 * 64) */
-#define VG 34 /* round(0.533 * 64) */
-#define VR -115 /* round(-1.793 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
+#define UB 137 /* round(2.14177 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.14177 * 64)) */
#endif
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR 107 /* round(1.67867 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
+
+// BT.2020 full range YUV to RGB reference
+// R = Y + V * 1.474600
+// G = Y - U * 0.164553 - V * 0.571353
+// B = Y + U * 1.881400
+// KR = 0.2627; KB = 0.0593
+
+#define UB 120 /* round(1.881400 * 64) */
+#define UG 11 /* round(0.164553 * 64) */
+#define VG 37 /* round(0.571353 * 64) */
+#define VR 94 /* round(1.474600 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
+
#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+#undef BB
+#undef BG
+#undef BR
+
+#undef MAKEYUVCONSTANTS
+
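+// Illustrative sketch (not part of this patch): a standalone check that the
+// 6 bit fixed point constants above track the float reference equations.
+// It uses the full range BT.2020 (V2020) numbers; the sample values are
+// arbitrary.
+//
+//   #include <stdint.h>
+//   #include <stdio.h>
+//   int main(void) {
+//     int y = 150, v = 200;              // arbitrary 8 bit Y and V sample
+//     int yg = 16320, yb = 32, vr = 94;  // V2020 constants from above
+//     int y1 = (int)(((uint32_t)(y * 0x0101) * yg) >> 16) + yb;
+//     int r16 = y1 + (v - 128) * vr;     // R in 8.6 fixed point, pre clamp
+//     // Prints "fixed 256 float 256.17"; the real code clamps to 255.
+//     printf("fixed %d float %.2f\n", r16 >> 6, y + (v - 128) * 1.4746);
+//     return 0;
+//   }
+//
+// The two macro variants below mirror how the constants are laid out for the
+// Arm/RISC-V kernels (kUVCoeff / kRGBCoeffBias) versus the x86 kernels
+// (kUVTo* / kYToRgb / kYBiasToRgb).
+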
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#define LOAD_YUV_CONSTANTS \
+ int ub = yuvconstants->kUVCoeff[0]; \
+ int vr = yuvconstants->kUVCoeff[1]; \
+ int ug = yuvconstants->kUVCoeff[2]; \
+ int vg = yuvconstants->kUVCoeff[3]; \
+ int yg = yuvconstants->kRGBCoeffBias[0]; \
+ int bb = yuvconstants->kRGBCoeffBias[1]; \
+ int bg = yuvconstants->kRGBCoeffBias[2]; \
+ int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 \
+ int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+ int b16 = y1 + (u * ub) - bb; \
+ int g16 = y1 + bg - (u * ug + v * vg); \
+ int r16 = y1 + (v * vr) - br
+#else
+#define LOAD_YUV_CONSTANTS \
+ int ub = yuvconstants->kUVToB[0]; \
+ int ug = yuvconstants->kUVToG[0]; \
+ int vg = yuvconstants->kUVToG[1]; \
+ int vr = yuvconstants->kUVToR[1]; \
+ int yg = yuvconstants->kYToRgb[0]; \
+ int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 \
+ int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+ int8_t ui = (int8_t)u; \
+ int8_t vi = (int8_t)v; \
+ ui -= 0x80; \
+ vi -= 0x80; \
+ int b16 = y1 + (ui * ub); \
+ int g16 = y1 - (ui * ug + vi * vg); \
+ int r16 = y1 + (vi * vr)
+#endif
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 8 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1316,39 +1804,12 @@ static __inline void YuvPixel(uint8_t y,
uint8_t* g,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
- *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6));
}
// Reads 8 bit YUV and leaves result as 16 bit.
@@ -1359,85 +1820,50 @@ static __inline void YuvPixel8_16(uint8_t y,
int* g,
int* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
// Reads 10 bit YUV and leaves result as 16 bit.
-static __inline void YuvPixel16(int16_t y,
- int16_t u,
- int16_t v,
- int* b,
- int* g,
- int* r,
- const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
+static __inline void YuvPixel10_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
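+ // Replicate the 10 bit y sample into 16 bits, msb aligned.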
+ uint32_t y32 = (y << 6) | (y >> 4);
+ u = STATIC_CAST(uint8_t, clamp255(u >> 2));
+ v = STATIC_CAST(uint8_t, clamp255(v >> 2));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
+}
- uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
- u = clamp255(u >> 2);
- v = clamp255(v >> 2);
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 12 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel12_16(int16_t y,
+ int16_t u,
+ int16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
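+ // Replicate the 12 bit y sample into 16 bits, msb aligned.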
+ uint32_t y32 = (y << 4) | (y >> 8);
+ u = STATIC_CAST(uint8_t, clamp255(u >> 4));
+ v = STATIC_CAST(uint8_t, clamp255(v >> 4));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 10 bit assembly.
@@ -1452,31 +1878,89 @@ static __inline void YuvPixel10(uint16_t y,
int b16;
int g16;
int r16;
- YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
- *b = Clamp(b16 >> 6);
- *g = Clamp(g16 >> 6);
- *r = Clamp(r16 >> 6);
+ YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6));
}
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// C reference code that mimics the YUV 12 bit assembly.
+// Reads 12 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel12(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ int b16;
+ int g16;
+ int r16;
+ YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6));
+}
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 8 bit.
+static __inline void YuvPixel16_8(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;
+ u = STATIC_CAST(uint16_t, clamp255(u >> 8));
+ v = STATIC_CAST(uint16_t, clamp255(v >> 8));
+ CALC_RGB16;
+ *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6));
}
-#undef YG
-#undef YGB
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;
+ u = STATIC_CAST(uint16_t, clamp255(u >> 8));
+ v = STATIC_CAST(uint16_t, clamp255(v >> 8));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
+}
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit Y and leaves result as 8 bit.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+ int yg = yuvconstants->kRGBCoeffBias[0];
+ int ygb = yuvconstants->kRGBCoeffBias[4];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ uint8_t b8 = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
+ *b = b8;
+ *g = b8;
+ *r = b8;
+}
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1484,45 +1968,33 @@ void I444ToARGBRow_C(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
- uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 2;
- src_v += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
+ for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
}
}
-#else
-void I444ToARGBRow_C(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
+
+void I444ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
src_v += 1;
- rgb_buf += 4; // Advance 1 pixel.
+ rgb_buf += 3; // Advance 1 pixel.
}
}
-#endif
// Also used for 420
void I422ToARGBRow_C(const uint8_t* src_y,
@@ -1578,9 +2050,102 @@ void I210ToARGBRow_C(const uint16_t* src_y,
}
}
+void I410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+void I210AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ src_a += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ }
+}
+
+void I410AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// 12 bit YUV to ARGB
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
uint32_t ar30;
- b = b >> 4; // convert 10.6 to 10 bit.
+ b = b >> 4; // convert 8.6 fixed point to 10 bit.
g = g >> 4;
r = r >> 4;
b = Clamp10(b);
@@ -1602,9 +2167,9 @@ void I210ToAR30Row_C(const uint16_t* src_y,
int g;
int r;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
- YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf + 4, b, g, r);
src_y += 2;
src_u += 1;
@@ -1612,11 +2177,141 @@ void I210ToAR30Row_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
}
}
+// 12 bit YUV to 10 bit AR30
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
+void I410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// P210 has 10 bits in the msbs of a 16 bit, NV12 style layout.
+void P210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5,
+ dst_argb + 6, yuvconstants);
+ dst_argb[7] = 255;
+ src_y += 2;
+ src_uv += 2;
+ dst_argb += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ }
+}
+
+void P410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ src_y += 1;
+ src_uv += 2;
+ dst_argb += 4; // Advance 1 pixel.
+ }
+}
+
+void P210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30 + 4, b, g, r);
+ src_y += 2;
+ src_uv += 2;
+ dst_ar30 += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ }
+}
+
+void P410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ src_y += 1;
+ src_uv += 2;
+ dst_ar30 += 4; // Advance 1 pixel.
+ }
+}
+
// 8 bit YUV to 10 bit AR30
// Uses the same code as 10 bit YUV but shifts the 8 bit values up to 10 bits.
void I422ToAR30Row_C(const uint8_t* src_y,
@@ -1645,6 +2340,26 @@ void I422ToAR30Row_C(const uint8_t* src_y,
}
}
+void I444AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
void I422AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1718,8 +2433,10 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
b1 = b1 >> 4;
g1 = g1 >> 4;
r1 = r1 >> 4;
- *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
- (g1 << 20) | (r1 << 24) | 0xf000f000;
+ *(uint16_t*)(dst_argb4444 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000);
+ *(uint16_t*)(dst_argb4444 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1730,7 +2447,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
- *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
+ *(uint16_t*)(dst_argb4444) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000);
}
}
@@ -1756,8 +2474,10 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 3;
r1 = r1 >> 3;
- *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
- (g1 << 21) | (r1 << 26) | 0x80008000;
+ *(uint16_t*)(dst_argb1555 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000);
+ *(uint16_t*)(dst_argb1555 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1768,7 +2488,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
- *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
+ *(uint16_t*)(dst_argb1555) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000);
}
}
@@ -1794,8 +2515,10 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
+ *(uint16_t*)(dst_rgb565 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11));
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1806,7 +2529,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -1921,8 +2645,12 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) |
+ STATIC_CAST(uint16_t, g0 << 5) |
+ STATIC_CAST(uint16_t, r0 << 11);
+ *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) |
+ STATIC_CAST(uint16_t, g1 << 5) |
+ STATIC_CAST(uint16_t, r1 << 11);
src_y += 2;
src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels.
@@ -1932,7 +2660,9 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) |
+ STATIC_CAST(uint16_t, g0 << 5) |
+ STATIC_CAST(uint16_t, r0 << 11);
}
}
@@ -2006,18 +2736,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2035,10 +2768,34 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) {
+ int x;
+ src += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2069,6 +2826,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+ int x;
+ src_rgb24 += width * 3 - 3;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2105,6 +2877,98 @@ void MergeUVRow_C(const uint8_t* src_u,
}
}
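+// Detile one row: gather 16 byte segments spaced src_tile_stride apart in the
+// tiled source into one contiguous row.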
+void DetileRow_C(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ memcpy(dst, src, 16);
+ dst += 16;
+ src += src_tile_stride;
+ }
+ if (width & 15) {
+ memcpy(dst, src, width & 15);
+ }
+}
+
+void DetileRow_16_C(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ memcpy(dst, src, 16 * sizeof(uint16_t));
+ dst += 16;
+ src += src_tile_stride;
+ }
+ if (width & 15) {
+ memcpy(dst, src, (width & 15) * sizeof(uint16_t));
+ }
+}
+
+void DetileSplitUVRow_C(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, 8);
+ dst_u += 8;
+ dst_v += 8;
+ src_uv += src_tile_stride;
+ }
+ if (width & 15) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, ((width & 15) + 1) / 2);
+ }
+}
+
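+// Interleave 16 pixel tiles of Y and UV into packed YUY2. Only whole 16
+// pixel tiles are converted; any trailing partial tile is left unwritten.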
+void DetileToYUY2_C(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ for (int x = 0; x < width - 15; x += 16) {
+ for (int i = 0; i < 8; i++) {
+ dst_yuy2[0] = src_y[0];
+ dst_yuy2[1] = src_uv[0];
+ dst_yuy2[2] = src_y[1];
+ dst_yuy2[3] = src_uv[1];
+ dst_yuy2 += 4;
+ src_y += 2;
+ src_uv += 2;
+ }
+ src_y += src_y_tile_stride - 16;
+ src_uv += src_uv_tile_stride - 16;
+ }
+}
+
+// Unpack MT2T into tiled P010, 64 pixels at a time. MT2T's bitstream is
+// encoded in 80 byte blocks representing 64 pixels each. The first 16 bytes
+// of a block contain all of the lower 2 bits of each pixel packed together,
+// and the next 64 bytes hold the upper 8 bits of each pixel. The lower bits
+// are packed into 1x4 blocks, whereas the upper bits are stored in normal
+// raster order.
+void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) {
+ for (size_t i = 0; i < size; i += 80) {
+ const uint8_t* src_lower_bits = src;
+ const uint8_t* src_upper_bits = src + 16;
+
+ for (int j = 0; j < 4; j++) {
+ for (int k = 0; k < 16; k++) {
+ *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 |
+ (uint16_t)*src_upper_bits << 8 |
+ (uint16_t)*src_upper_bits >> 2;
+ src_upper_bits++;
+ }
+ }
+
+ src += 80;
+ }
+}
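+
+// Illustrative sketch (not part of this patch): feeding one 80 byte MT2T
+// block through the helper above; the buffer contents here are arbitrary.
+//
+//   uint8_t block[80] = {0};  // 16 bytes of low bits + 64 bytes of high bits
+//   uint16_t pixels[64];      // 64 outputs, 10 bits msb aligned (P010 style)
+//   UnpackMT2T_C(block, pixels, sizeof(block));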
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -2133,27 +2997,197 @@ void MergeRGBRow_C(const uint8_t* src_r,
}
}
-// Use scale to convert lsb formats to msb, depending how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
+void SplitARGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ dst_a[x] = src_argb[3];
+ src_argb += 4;
+ }
+}
+
+void MergeARGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = src_a[x];
+ dst_argb += 4;
+ }
+}
+
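+// Pack 10 bit R, G and B planes into AR30: 2 bit alpha in the top bits, then
+// 10 bits each of r, g, b down to the lsb. e.g. 0x3ff in all three channels
+// yields opaque white, 0xffffffff.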
+void MergeXR30Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ assert(depth >= 10);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 10;
+ uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30;
+ for (x = 0; x < width; ++x) {
+ uint32_t r = clamp1023(src_r[x] >> shift);
+ uint32_t g = clamp1023(src_g[x] >> shift);
+ uint32_t b = clamp1023(src_b[x] >> shift);
+ dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000;
+ }
+}
+
+void MergeAR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;
+ int max = (1 << depth) - 1;
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift);
+ dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift);
+ dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift);
+ dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift);
+ dst_ar64 += 4;
+ }
+}
+
+void MergeARGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift));
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift));
+ dst_argb[3] = STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift));
+ dst_argb += 4;
+ }
+}
+
+void MergeXR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;
+ int max = (1 << depth) - 1;
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift);
+ dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift);
+ dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift);
+ dst_ar64[3] = 0xffff;
+ dst_ar64 += 4;
+ }
+}
+
+void MergeXRGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift));
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift));
+ dst_argb[3] = 0xff;
+ dst_argb += 4;
+ }
+}
+
+void SplitXRGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ src_argb += 4;
+ }
+}
+
+void MergeXRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = 255;
+ dst_argb += 4;
+ }
+}
+
+// Convert lsb formats to msb, depending on sample depth.
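+// e.g. depth 10: shift = 6, so a 10 bit 0x3ff becomes msb aligned 0xffc0.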
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
int width) {
+ int shift = 16 - depth;
+ assert(depth >= 8);
+ assert(depth <= 16);
int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_uv[0] = src_u[x] * scale;
- dst_uv[1] = src_v[x] * scale;
- dst_uv[2] = src_u[x + 1] * scale;
- dst_uv[3] = src_v[x + 1] * scale;
- dst_uv += 4;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift);
+ dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift);
+ dst_uv += 2;
}
- if (width & 1) {
- dst_uv[0] = src_u[width - 1] * scale;
- dst_uv[1] = src_v[width - 1] * scale;
+}
+
+// Convert msb formats to lsb, depending on sample depth.
+void SplitUVRow_16_C(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int x;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ for (x = 0; x < width; ++x) {
+ dst_u[x] = src_uv[0] >> shift;
+ dst_v[x] = src_uv[1] >> shift;
+ src_uv += 2;
}
}
@@ -2163,7 +3197,17 @@ void MultiplyRow_16_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- dst_y[x] = src_y[x] * scale;
+ dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale);
+ }
+}
+
+void DivideRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = (src_y[x] * scale) >> 16;
}
}
@@ -2172,13 +3216,19 @@ void MultiplyRow_16_C(const uint16_t* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
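+// i.e. scale = 1 << (24 - bits), so (v * scale) >> 16 == v >> (bits - 8).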
+
void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+
for (x = 0; x < width; ++x) {
- dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+ dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale));
}
}
@@ -2208,10 +3258,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
@@ -2232,6 +3281,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
}
}
+// Filter 2 rows of YUY2 UV's (422) into UV (NV12).
+void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_uv += 2;
+ }
+}
+
// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
@@ -2309,56 +3373,56 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[3] = 255u;
- fb = src_argb0[4 + 0];
- fg = src_argb0[4 + 1];
- fr = src_argb0[4 + 2];
- a = src_argb0[4 + 3];
+ fb = src_argb[4 + 0];
+ fg = src_argb[4 + 1];
+ fr = src_argb[4 + 2];
+ a = src_argb[4 + 3];
bb = src_argb1[4 + 0];
bg = src_argb1[4 + 1];
br = src_argb1[4 + 2];
- dst_argb[4 + 0] = BLEND(fb, bb, a);
- dst_argb[4 + 1] = BLEND(fg, bg, a);
- dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[4 + 3] = 255u;
- src_argb0 += 8;
+ src_argb += 8;
src_argb1 += 8;
dst_argb += 8;
}
if (width & 1) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[3] = 255u;
}
}
@@ -2385,10 +3449,9 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#define ATTENUATE(f, a) (f * a + 255) >> 8
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -2399,7 +3462,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
b = src_argb[4];
g = src_argb[5];
r = src_argb[6];
@@ -2407,7 +3470,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[4] = ATTENUATE(b, a);
dst_argb[5] = ATTENUATE(g, a);
dst_argb[6] = ATTENUATE(r, a);
- dst_argb[7] = a;
+ dst_argb[7] = STATIC_CAST(uint8_t, a);
src_argb += 8;
dst_argb += 8;
}
@@ -2420,7 +3483,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
}
}
#undef ATTENUATE
@@ -2472,6 +3535,14 @@ const uint32_t fixed_invtbl8[256] = {
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
+#if LIBYUV_UNATTENUATE_DUP
+// This code mimics the Intel SIMD version for better testability.
+#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
+#else
+#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
+#endif
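+
+// e.g. f = 128, a = 128: ia ~= 0x10000 / 128 = 512 (2.0 in 8.8 fixed point);
+// either variant computes ~256 and clamps to 255.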
+
+// Mimics the Intel SIMD code for exactness.
void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
@@ -2482,14 +3553,12 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint32_t r = src_argb[2];
const uint32_t a = src_argb[3];
const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
- b = (b * ia) >> 8;
- g = (g * ia) >> 8;
- r = (r * ia) >> 8;
+
// Clamping should not be necessary but is free in assembly.
- dst_argb[0] = clamp255(b);
- dst_argb[1] = clamp255(g);
- dst_argb[2] = clamp255(r);
- dst_argb[3] = a;
+ dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia));
+ dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia));
+ dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia));
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
src_argb += 4;
dst_argb += 4;
}
@@ -2519,13 +3588,24 @@ void CumulativeSumToAverageRow_C(const int32_t* tl,
int area,
uint8_t* dst,
int count) {
- float ooa = 1.0f / area;
+ float ooa;
int i;
+ assert(area != 0);
+
+ ooa = 1.0f / STATIC_CAST(float, area);
for (i = 0; i < count; ++i) {
- dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
- dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
- dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
- dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst[0] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) *
+ ooa);
+ dst[1] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) *
+ ooa);
+ dst[2] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) *
+ ooa);
+ dst[3] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) *
+ ooa);
dst += 4;
tl += 4;
bl += 4;
@@ -2576,6 +3656,19 @@ static void HalfRow_16_C(const uint16_t* src_uv,
}
}
+static void HalfRow_16To8_C(const uint16_t* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint8_t* dst_uv,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = STATIC_CAST(
+ uint8_t,
+ C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale));
+ }
+}
+
// C version 2x2 -> 2x1.
void InterpolateRow_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2586,6 +3679,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
@@ -2594,21 +3690,17 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
- dst_ptr[1] =
- (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint8_t,
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
}
+// C version 2x2 -> 2x1.
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
@@ -2618,23 +3710,65 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
- if (source_y_fraction == 0) {
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
- if (source_y_fraction == 128) {
+ if (y1_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint16_t,
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+}
+
+// C version 2x2 16 bit -> 2x1 8 bit.
+// Use scale to convert lsb formats to msb, depending on how many bits there
+// are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
+void InterpolateRow_16To8_C(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (source_y_fraction == 0) {
+ Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
+ return;
+ }
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint8_t,
+ C16TO8(
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+ scale));
+ src_ptr += 1;
+ src_ptr1 += 1;
+ dst_ptr += 1;
}
}
@@ -2743,10 +3877,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb,
dr += poly[14] * r3;
da += poly[15] * a3;
- dst_argb[0] = Clamp((int32_t)(db));
- dst_argb[1] = Clamp((int32_t)(dg));
- dst_argb[2] = Clamp((int32_t)(dr));
- dst_argb[3] = Clamp((int32_t)(da));
+ dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db)));
+ dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg)));
+ dst_argb[2] = STATIC_CAST(uint8_t, Clamp((int32_t)(dr)));
+ dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da)));
src_argb += 4;
dst_argb += 4;
}
@@ -2873,7 +4007,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
@@ -3151,6 +4285,32 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
}
#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+void I444ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth;
+ src_v += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
#if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
@@ -3175,12 +4335,93 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ // Row buffer for intermediate 16 bit pixels.
+ SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+ Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+ src_ptr += twidth;
+ dst_ptr += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_INTERPOLATEROW_16TO8_AVX2
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
-#if defined(__clang__)
-#pragma clang loop vectorize_width(4)
-#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
@@ -3211,8 +4452,9 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
- *dst++ =
- (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+ *dst++ = STATIC_CAST(
+ uint16_t,
+ (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8);
++src;
}
}
@@ -3231,6 +4473,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
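+
+// Filter one row with 1, 4, 6, 4, 1 coefficients to produce 1 row. The
+// 1 / 256 factor folds in the 16x gain of this pass and of the matching
+// column pass.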
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
+
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
@@ -3256,13 +4521,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
}
// Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
// Output a row of UV values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 5] + 2) >>
2;
@@ -3273,12 +4539,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv,
dst_uv += 2;
}
if (width & 1) {
- dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
}
}
@@ -3289,7 +4551,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
int width) {
// Output a row of VU values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 4] + 2) >>
2;
@@ -3300,12 +4562,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
dst_vu += 2;
}
if (width & 1) {
- dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
}
}
@@ -3319,7 +4577,8 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
}
}
-void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t u = src_uv[0];
@@ -3331,19 +4590,32 @@ void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
}
}
-// divide values by weights and provide mask to indicate weight of 0.
-void FloatDivToByteRow_C(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
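+// 2x2 average planar U and V and interleave the result into NV12 style UV.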
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
int x;
- for (x = 0; x < width; ++x) {
- dst_out[x] = Clamp(src_values[x] / src_weights[x]);
- dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
}
}
+#undef STATIC_CAST
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
new file mode 100644
index 00000000..d8074987
--- /dev/null
+++ b/source/row_gcc.cc
@@ -0,0 +1,9744 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
+
+// JPEG full range.
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
+ 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0,
+ -43, -84, 127, 0, -43, -84, 127, 0};
+
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0};
+
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
+ 127, -107, -20, 0, 127, -107, -20, 0};
+
+// Constants for BGRA
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
+
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
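+
+// A scalar sketch of how the Y constants above combine (illustration only;
+// hypothetical helper, not part of the library). The Y kernels below subtract
+// 128 from each byte, multiply-add the coefficients, then add back a bias:
+// kAddY16 (0x7e80) equals 0x1080 + 128 * (25 + 129 + 66), so the net result
+// is Y = (66*R + 129*G + 25*B + 0x1080) >> 8.
+#if 0
+static uint8_t ScalarARGBToY(uint8_t b, uint8_t g, uint8_t r) {
+  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}
+#endif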
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
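+
+// How to read the shuffle tables above: pshufb computes dst[i] = src[tbl[i]]
+// for each byte, and an index with the high bit set (128) writes zero. A
+// scalar sketch of that semantic, for illustration only (hypothetical helper,
+// not part of the library):
+#if 0
+static void ScalarPshufb(const uint8_t src[16], const uint8_t tbl[16],
+                         uint8_t dst[16]) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    dst[i] = (tbl[i] & 0x80) ? 0 : src[tbl[i] & 15];
+  }
+}
+#endif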
+
+#ifdef HAS_J400TOARGBROW_SSE2
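+// J400 is a grayscale format: each iteration expands 8 Y bytes into 8 gray
+// ARGB pixels with B = G = R = Y and A = 0xff (the 0xff000000 built in xmm5).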
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Same code as RAWToARGBRow but with a different shuffler and A in the low bits.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd permutation packing two 12 byte lane halves into 24 contiguous bytes
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
+ 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+ 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+ 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+ 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+ 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+ 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+ 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+ 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
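+
+// Note: vpermt2b indexes into the 64 byte concatenation of two source
+// registers, so table entries 0-31 select bytes from the first source and
+// 32-63 from the second.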
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kPermARGBToRGB24_0), // %3
+ "m"(kPermARGBToRGB24_1), // %4
+ "m"(kPermARGBToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+/*
+
+ARGBToAR30Row:
+
+Red and Blue
+With the 8 bit value in the upper byte of a short, vpmulhuw by (1024+4)
+produces a 10 bit value in the low 10 bits of each 16 bit value, which is
+what the blue channel needs. Red must end up 4 bits further left, so it is
+multiplied by (1024+4)*16 instead.
+
+Alpha and Green
+Alpha and green are already in the high bits, so vpand can zero out the other
+bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
+multiplier, (1024+4), would put the 10 bit green in the low bits. Alpha needs
+a simple multiplier to shift it into position, with a gap of 10 bits above the
+green. Green is 10 bits, so there are 6 bits left in the low short; 4 more are
+needed, so a multiplier of 4 moves the 2 alpha bits into the upper 16 bits,
+and a further shift of 4 is a multiply by 16, giving (4*16) = 64. The result
+is then shifted left 10 to position the A and G channels.
+*/
+
+// Shuffle tables to place the B and R bytes in the upper byte of each 16 bit
+// lane: kShuffleRB30 for ARGB input, kShuffleBR30 for ABGR input.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
+ 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
+ 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
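+
+// A scalar sketch of the packing described above (illustration only;
+// hypothetical helper, not part of the library). Each 8 bit channel widens to
+// 10 bits as (v << 2) | (v >> 6), which is exactly (v * 1028) >> 8, and alpha
+// keeps only its top 2 bits:
+#if 0
+static uint32_t ScalarARGBToAR30(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
+  uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);
+  uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
+  uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
+  return ((uint32_t)(a >> 6) << 30) | (r10 << 20) | (g10 << 10) | b10;
+}
+#endif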
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
+ 6, 6, 5, 5, 4, 4, 7, 7};
+static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
+ 14, 14, 13, 13, 12, 12, 15, 15};
+
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm2 \n"
+ "movdqa %4,%%xmm3 \n" LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToAB64Lo), // %3
+ "m"(kShuffleARGBToAB64Hi) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrlw $8,%%xmm0 \n"
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm2 \n" LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrlw $8,%%xmm0 \n"
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToABGR) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm2 \n"
+ "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"
+ "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToAB64Lo), // %3
+ "m"(kShuffleARGBToAB64Hi) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpsrlw $8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x40(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_AB64TOARGBROW_AVX2
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpsrlw $8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x40(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToABGR) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// The round parameter is the register holding the value to add before the shift.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
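+
+// Worked example of the round parameter (illustration only): the kernels
+// subtract 128 from every byte before pmaddubsw, which removes
+// 128 * (sum of coefficients) from each 16 bit sum. For the BT.601 Y kernels
+// the coefficients sum to 220, so round = kAddY16 (0x7e80) restores
+// 128 * 220 = 28160 and leaves 0x1080 = 16 * 256 + 128, i.e. the +16 offset
+// plus +0.5 for rounding. For the full range YJ kernels the coefficients sum
+// to 256, so round = kSub128 (0x8080) restores 32768 and leaves +128 (+0.5).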
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16.
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ABGRTOYJROW_SSSE3
+// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
+// Same as ABGRToYRow but different coefficients, no add 16.
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ABGRTOYJROW_SSSE3
+
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
+// Same as RGBAToYRow but different coefficients, no add 16.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
+#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
+ defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+// vpermd to restore dword order after vphaddw + vpackuswb (both mutate lanes).
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+#endif
+
+#ifdef HAS_ARGBTOYROW_AVX2
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ABGRTOYJROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 YJ values.
+void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYJROW_AVX2
+
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
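+
+// A scalar sketch of the UV math used by the kernels above and below
+// (illustration only; hypothetical helper, not part of the library). Source
+// pixels are 2x2 box averaged first; the shift is arithmetic:
+#if 0
+static void ScalarARGBToUV(uint8_t b, uint8_t g, uint8_t r,
+                           uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
+  *v = (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
+}
+#endif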
+
+#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
+ defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
+// vpshufb to restore the order of the UV shorts after vphaddw + vpackuswb.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+#endif
+
+#if defined(HAS_ARGBTOUVROW_AVX2)
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kSub128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix
+#ifdef HAS_ABGRTOUVJROW_AVX2
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kSub128), // %5
+ "m"(kABGRToVJ), // %6
+ "m"(kABGRToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ABGRTOUVJROW_SSSE3
+void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToVJ), // %5
+ "m"(kABGRToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm3 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV
+#define READYUV210 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
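+// Read 4 UV from 422 10 bit, upsample to 8 UV. With 8 Alpha.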
+#define READYUVA210 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444 10 bit
+#define READYUV410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 444 10 bit. With 8 Alpha.
+#define READYUVA410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from 422 12 bit, upsample to 8 UV
+#define READYUV212 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $0x4,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $4,%%xmm4 \n" \
+ "psrlw $8,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ "movd (%[u_buf]),%%xmm3 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21 \
+ "movq (%[vu_buf]),%%xmm3 \n" \
+ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2 \
+ "movdqu (%[yuy2_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu (%[yuy2_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
+ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY \
+ "movdqu (%[uyvy_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu (%[uyvy_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
+ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+
+// Read 4 UV from P210, upsample to 8 UV
+#define READP210 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from P410
+#define READP410 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants) \
+ "pcmpeqb %%xmm13,%%xmm13 \n" \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "pxor %%xmm12,%%xmm12 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+ "psllw $7,%%xmm13 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+ "pshufb %%xmm12,%%xmm13 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm12 \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB16(yuvconstants) \
+ "psubb %%xmm13,%%xmm3 \n" \
+ "pmulhuw %%xmm11,%%xmm4 \n" \
+ "movdqa %%xmm8,%%xmm0 \n" \
+ "movdqa %%xmm9,%%xmm1 \n" \
+ "movdqa %%xmm10,%%xmm2 \n" \
+ "paddw %%xmm12,%%xmm4 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB16(yuvconstants) \
+ "pcmpeqb %%xmm0,%%xmm0 \n" \
+ "pxor %%xmm1,%%xmm1 \n" \
+ "psllw $7,%%xmm0 \n" \
+ "pshufb %%xmm1,%%xmm0 \n" \
+ "psubb %%xmm0,%%xmm3 \n" \
+ "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
+ "movdqa (%[yuvconstants]),%%xmm0 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
+ "paddw %%xmm3,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS
+#endif // defined(__x86_64__)
+
+#define YUVTORGB(yuvconstants) \
+ YUVTORGB16(yuvconstants) \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0,(%[dst_argb]) \n" \
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
+ "lea 0x20(%[dst_argb]), %[dst_argb] \n"
+
+// Store 8 RGBA values.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5,(%[dst_rgba]) \n" \
+ "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+
+// Store 8 RGB24 values.
+#define STORERGB24 \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm2,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "pshufb %%xmm5,%%xmm0 \n" \
+ "pshufb %%xmm6,%%xmm1 \n" \
+ "palignr $0xc,%%xmm0,%%xmm1 \n" \
+ "movq %%xmm0,(%[dst_rgb24]) \n" \
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+
+// Store 8 AR30 values.
+#define STOREAR30 \
+ "psraw $0x4,%%xmm0 \n" \
+ "psraw $0x4,%%xmm1 \n" \
+ "psraw $0x4,%%xmm2 \n" \
+ "pminsw %%xmm7,%%xmm0 \n" \
+ "pminsw %%xmm7,%%xmm1 \n" \
+ "pminsw %%xmm7,%%xmm2 \n" \
+ "pmaxsw %%xmm6,%%xmm0 \n" \
+ "pmaxsw %%xmm6,%%xmm1 \n" \
+ "pmaxsw %%xmm6,%%xmm2 \n" \
+ "psllw $0x4,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm3 \n" \
+ "movdqa %%xmm1,%%xmm2 \n" \
+ "punpcklwd %%xmm5,%%xmm1 \n" \
+ "punpckhwd %%xmm5,%%xmm2 \n" \
+ "pslld $0xa,%%xmm1 \n" \
+ "pslld $0xa,%%xmm2 \n" \
+ "por %%xmm1,%%xmm0 \n" \
+ "por %%xmm2,%%xmm3 \n" \
+ "movdqu %%xmm0,(%[dst_ar30]) \n" \
+ "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
+ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
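+
+// Typical use of a row function like the one above: the caller walks the
+// image a row at a time. A minimal sketch (the buffer and stride names here
+// are hypothetical, not part of this file):
+//
+//   for (int y = 0; y < height; ++y) {
+//     I444ToARGBRow_SSSE3(src_y + y * y_stride, src_u + y * u_stride,
+//                         src_v + y * v_stride, dst_argb + y * dst_stride,
+//                         yuvconstants, width);
+//   }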
+
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_SSSE3
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGB24
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STORERGB24
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 12 bit YUV to ARGB
+void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 12 bit YUV to AR30
+void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+// 10 bit YUVA to ARGB
+void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub %[u_buf],%[v_buf] \n"
+
+    LABELALIGN
+    "1: \n"
+    READYUVA210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [a_buf]"+r"(a_buf), // %[a_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width) // %[width]
+#else
+    [width]"+rm"(width) // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif // HAS_I210ALPHATOARGBROW_SSSE3
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+// 10 bit YUVA to ARGB
+void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile(
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf),
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width] "+m"(width) // %[width]
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+ // clang-format on
+}
+#endif // HAS_I410ALPHATOARGBROW_SSSE3
+
+// 10 bit YUV to AR30
+void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_SSSE3
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [uv_buf]"+r"(uv_buf), // %[uv_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+
+void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP410
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [uv_buf]"+r"(uv_buf), // %[uv_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+
+void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGBA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3 || HAS_I422TOARGBROW_AVX2
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+#define READYUV422_AVX512BW \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
+ "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
+ "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
+ "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
+ "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
+ "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
+ "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
+ "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210, upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
+#define READYUVA210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 410
+#define READYUV410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 212 12 bit, upsample to 16 UV
+#define READYUV212_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $4,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $8,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from 410. With 16 Alpha.
+#define READYUVA410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ "vmovdqu (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ "vmovdqu (%[vu_buf]),%%xmm3 \n" \
+ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from P210, upsample to 16 UV
+#define READP210_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from P410
+#define READP410_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
+ "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
+ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
+ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
+
+// TODO(fbarchard): Remove broadcastb
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+ "vpbroadcastb %%xmm13,%%ymm13 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
+ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
+
+#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "vpbroadcastq %%xmm8, %%zmm8 \n" \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" \
+ "vpbroadcastb %%xmm13,%%zmm13 \n" \
+ "movq 32(%[yuvconstants]),%%xmm9 \n" \
+ "vpbroadcastq %%xmm9,%%zmm9 \n" \
+ "movq 64(%[yuvconstants]),%%xmm10 \n" \
+ "vpbroadcastq %%xmm10,%%zmm10 \n" \
+ "movq 96(%[yuvconstants]),%%xmm11 \n" \
+ "vpbroadcastq %%xmm11,%%zmm11 \n" \
+ "movq 128(%[yuvconstants]),%%xmm12 \n" \
+ "vpbroadcastq %%xmm12,%%zmm12 \n" \
+ "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
+ "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
+ "vmovdqu8 (%[unperm]),%%zmm18 \n"
+
+#define YUVTORGB16_AVX2(yuvconstants) \
+ "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
+ "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
+ "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB16_AVX512BW(yuvconstants) \
+ "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
+ "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
+ "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
+ "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
+ "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
+ "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
+ "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
+ "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
+ "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
+
+#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+#define YUVTORGB_REGS_AVX512BW \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
+
+#else
+
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB16_AVX2(yuvconstants) \
+ "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
+ "vpsllw $7,%%xmm0,%%xmm0 \n" \
+ "vpbroadcastb %%xmm0,%%ymm0 \n" \
+ "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
+ "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_REGS_AVX2
+#endif // defined(__x86_64__)
+
+#define YUVTORGB_AVX2(yuvconstants) \
+ YUVTORGB16_AVX2(yuvconstants) \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_AVX512BW(yuvconstants) \
+ YUVTORGB16_AVX512BW(yuvconstants) \
+ "vpsraw $0x6,%%zmm0,%%zmm0 \n" \
+ "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
+ "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
+ "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
+ "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
+ "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1,(%[dst_argb]) \n" \
+ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
+ "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+
+// Store 32 ARGB values.
+#define STOREARGB_AVX512BW \
+ "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
+ "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
+ "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
+ "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
+ "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
+ "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
+ "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
+ "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
+ "lea 0x80(%[dst_argb]), %[dst_argb] \n"
+
+// Store 16 AR30 values.
+#define STOREAR30_AVX2 \
+ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+ "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+ "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+ "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+ "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+ "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
+ "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm2,%%ymm2 \n" \
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
+ "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
+ "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I444TOARGBROW_AVX2
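+
+// Note that every AVX2 loop in this file ends with vzeroupper: it clears the
+// upper ymm state before returning so that subsequent SSE code does not pay
+// the AVX-to-SSE transition penalty.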
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
+static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
+static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
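+
+// Our reading of these index tables: kSplitQuadWords and
+// kSplitDoubleQuadWords scatter the 128-/256-bit loads across the zmm lanes
+// before the byte unpacks, and kUnpermuteAVX512 ({0,4,1,5,...}) re-interleaves
+// the halves afterwards so pixels land back in source order.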
+
+// 32 pixels
+// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
+// bytes).
+void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX512BW(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
+ "vpbroadcastq %%xmm5,%%zmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX512BW
+ YUVTORGB_AVX512BW(yuvconstants)
+ STOREARGB_AVX512BW
+ "sub $0x20,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
+ [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
+ [unperm]"r"(kUnpermuteAVX512) // %[unperm]
+ : "memory", "cc", YUVTORGB_REGS_AVX512BW
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX512BW
+
+#if defined(HAS_I422TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I212TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I212TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I212TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I212TOAR30ROW_AVX2
+
+#if defined(HAS_I410TOARGBROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I410TOARGBROW_AVX2
+
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I210ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I410ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I410TOAR30ROW_AVX2
+
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%[dst_argb]) \n"
+ "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
+ "lea 0x40(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#if defined(HAS_P210TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P210TOARGBROW_AVX2
+
+#if defined(HAS_P410TOARGBROW_AVX2)
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P410TOARGBROW_AVX2
+
+#if defined(HAS_P210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P210TOAR30ROW_AVX2
+
+#if defined(HAS_P410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P410TOAR30ROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_SSE2
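+
+// The fixed-point path above computes G = (y - 16) * 1.164 with pmulhuw
+// (yg), paddsw (ygb) and psraw, then replicates G into B and R and ORs in
+// 0xff alpha. A float sketch of the same per-pixel mapping (illustrative
+// helper, not part of libyuv):
+static inline uint8_t I400Luma_Sketch(uint8_t y) {
+  int v = (int)(1.164f * ((int)y - 16) + 0.5f);  // BT.601 limited-range expand
+  if (v < 0) v = 0;
+  if (v > 255) v = 255;
+  return (uint8_t)v;  // stored to B, G and R; A is forced to 0xff
+}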
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_SSSE3
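+
+// Both mirror kernels are the SIMD form of this scalar reversal (sketch
+// only, illustrative name; the pshufb table above reverses 16 bytes per
+// iteration):
+static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = src[width - 1 - x];  // reverse byte order
+  }
+}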
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored. First byte zero.
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle last 5 pixels to first 5 mirrored. Last byte zero.
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Shuffle 5 pixels at a time (15 bytes)
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
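+
+// The two shuffle tables reverse five 3-byte pixels per 16-byte load (15
+// bytes used), with a trailing special case for the last pixel. Scalar
+// equivalent, as a sketch with an illustrative name:
+static void RGB24MirrorRow_Sketch(const uint8_t* src_rgb24,
+                                  uint8_t* dst_rgb24,
+                                  int width) {
+  int x;
+  src_rgb24 += (width - 1) * 3;  // start at the last pixel
+  for (x = 0; x < width; ++x) {
+    dst_rgb24[0] = src_rgb24[0];  // B
+    dst_rgb24[1] = src_rgb24[1];  // G
+    dst_rgb24[2] = src_rgb24[2];  // R
+    dst_rgb24 += 3;
+    src_rgb24 -= 3;
+  }
+}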
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "lea -0x10(%0,%2,4),%0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vmovdqu %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_SSE2
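+
+// Both SplitUV kernels de-interleave UV pairs; the pand keeps the even
+// (U) bytes and the psrlw exposes the odd (V) bytes. Scalar sketch
+// (illustrative name):
+static void SplitUVRow_Sketch(const uint8_t* src_uv,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_u[x] = src_uv[2 * x + 0];
+    dst_v[x] = src_uv[2 * x + 1];
+  }
+}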
+
+#ifdef HAS_DETILEROW_SSE2
+void DetileRow_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "lea (%0,%3),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_SSE2
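+
+// Detiling copies one 16-byte row out of each tile, then hops a whole
+// tile stride to the matching row of the next tile. Scalar sketch
+// (illustrative name; assumes width is a multiple of 16, as the SIMD
+// kernels do):
+static void DetileRow_Sketch(const uint8_t* src,
+                             ptrdiff_t src_tile_stride,
+                             uint8_t* dst,
+                             int width) {
+  int x, i;
+  for (x = 0; x < width; x += 16) {
+    for (i = 0; i < 16; ++i) {
+      dst[x + i] = src[i];  // one row of one 16-byte-wide tile
+    }
+    src += src_tile_stride;  // same row, next tile
+  }
+}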
+
+#ifdef HAS_DETILEROW_16_SSE2
+void DetileRow_16_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILEROW_16_SSE2
+
+#ifdef HAS_DETILEROW_16_AVX
+void DetileRow_16_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea (%0,%3,2),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_16_AVX
+
+#ifdef HAS_DETILETOYUY2_SSE2
+// Read 16 Y and 8 UV, and write 8 YUYV macropixels (16 pixels, 32 bytes).
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // Load 16 Y
+ "sub $0x10,%3 \n"
+ "lea (%0,%4),%0 \n"
+ "movdqu (%1),%%xmm1 \n" // Load 8 UV
+ "lea (%1,%5),%1 \n"
+ "movdqu %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
+ );
+}
+#endif // HAS_DETILETOYUY2_SSE2
+
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+// TODO(greenjustin): Look into generating these constants instead of loading
+// them since this can cause branch mispredicts for fPIC code on 32-bit
+// machines.
+static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 3, 5, 7, 9, 11, 13, 15};
+
+// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
+// slow on older SSE2 processors.
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqu %4,%%xmm1 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea (%0, %5),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "movhps %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "m"(kDeinterlaceUV), // %4
+ "r"(src_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILESPLITUVROW_SSSE3
+
+#ifdef HAS_MERGEUVROW_AVX512BW
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%0),%%zmm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+ "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+ "vmovdqu64 %%zmm2,(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX512BW
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%0),%%ymm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_SSE2
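+
+// The merge kernels interleave U and V back into UV pairs; punpcklbw and
+// punpckhbw emit 16 pairs per iteration. Scalar sketch (illustrative
+// name):
+static void MergeUVRow_Sketch(const uint8_t* src_u,
+                              const uint8_t* src_v,
+                              uint8_t* dst_uv,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[2 * x + 0] = src_u[x];
+    dst_uv[2 * x + 1] = src_v[x];
+  }
+}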
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vmovd %5,%%xmm4 \n"
+
+ "sub %0,%1 \n"
+
+ // 8 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm0 \n"
+ "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(16 - depth), // %4
+ "r"(32 - depth) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_16_AVX2
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15};
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ depth = 16 - depth;
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+ "sub %1,%2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+
+ "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
+ "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
+ "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(depth), // %4
+ "m"(kSplitUVShuffle16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_SPLITUVROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
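+
+// For MultiplyRow above, the listed scales are 1 << (16 - depth):
+// vpmullw keeps the low 16 bits of the product, so the multiply is a
+// left shift that moves lsb-justified samples into msb position. Worked
+// sketch (illustrative name):
+static inline uint16_t LsbToMsb_Sketch(uint16_t v, int depth) {
+  int scale = 1 << (16 - depth);  // 9 bits: 128, 10: 64, 12: 16, 16: 1
+  return (uint16_t)(v * scale);   // equivalent to v << (16 - depth)
+}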
+
+// Use scale to convert msb formats to lsb, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// 65536 = 16 bits
+#ifdef HAS_DIVIDEROW_16_AVX2
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width), // %2
+ "+r"(scale) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_DIVIDEROW_16_AVX2
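+
+// For DivideRow above, scale = 1 << depth: vpmulhuw keeps the high 16
+// bits of the product, so (v * scale) >> 16 is v >> (16 - depth), moving
+// msb-justified samples back down. Sketch for the depths that fit a
+// 16-bit lane (illustrative name):
+static inline uint16_t MsbToLsb_Sketch(uint16_t v, int depth) {
+  uint32_t scale = 1u << depth;  // 9 bits: 512, 10: 1024, 12: 4096
+  return (uint16_t)(((uint32_t)v * scale) >> 16);
+}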
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT16TO8ROW_AVX2
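+
+// For Convert16To8 above, scale = 1 << (24 - depth), so the pmulhuw
+// result (v * scale) >> 16 equals v >> (depth - 8); packuswb then
+// saturates to bytes. Sketch (illustrative name):
+static inline uint8_t Convert16To8_Sketch(uint16_t v, int depth) {
+  uint32_t scale = 1u << (24 - depth);  // 9: 32768, 10: 16384, 12: 4096, 16: 256
+  uint32_t r = ((uint32_t)v * scale) >> 16;
+  return (uint8_t)(r > 255 ? 255 : r);  // packuswb-style saturation
+}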
+
+// Use scale to convert to lsb formats, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT8TO16ROW_AVX2
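+
+// Convert8To16 above first doubles each byte into both halves of a word
+// (punpcklbw v,v yields v * 257, mapping 0xff to 0xffff), then pmulhuw
+// by scale = 1 << depth maps full-range 255 to the depth-bit maximum.
+// Sketch (illustrative name):
+static inline uint16_t Convert8To16_Sketch(uint8_t v, int depth) {
+  uint32_t w = (uint32_t)v * 257u;       // replicate byte: 0x00ff -> 0xffff
+  uint32_t scale = 1u << depth;          // 9: 512, 10: 1024, 12: 4096
+  return (uint16_t)((w * scale) >> 16);  // 255 -> (1 << depth) - 1
+}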
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kSplitRGBShuffle[9] = {
+ {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
+ 7u, 10u, 13u},
+ {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
+ 8u, 11u, 14u},
+ {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
+ 12u, 15u}};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 0(%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 48(%5),%%xmm0 \n"
+ "pshufb 64(%5),%%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"(&kSplitRGBShuffle[0]) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITRGBROW_SSSE3
+
+#ifdef HAS_MERGERGBROW_SSSE3
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kMergeRGBShuffle[9] = {
+ {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
+ 128u, 5u},
+ {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
+ 128u, 128u},
+ {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
+ 4u, 128u},
+ {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
+ 10u, 128u},
+ {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
+ 128u, 10u},
+ {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
+ 128u, 128u},
+ {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
+ 15u, 128u, 128u},
+ {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
+ 128u, 15u, 128u},
+ {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
+ 128u, 128u, 15u}};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb (%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 48(%5), %%xmm0 \n"
+ "pshufb 64(%5), %%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : "r"(&kMergeRGBShuffle[0]) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_MERGEARGBROW_SSE2
+void MergeARGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movq (%0,%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%0,%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "movq (%0,%3),%%xmm1 \n" // A
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%4) \n"
+ "movdqu %%xmm1,16(%4) \n"
+
+ "lea 8(%0),%0 \n"
+ "lea 32(%4),%4 \n"
+ "sub $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEARGBROW_SSE2
+
+#ifdef HAS_MERGEXRGBROW_SSE2
+void MergeXRGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "movq (%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "pcmpeqd %%xmm1,%%xmm1 \n" // A(255)
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,16(%3) \n"
+
+ "lea 8(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 32(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_SSE2
+
+#ifdef HAS_MERGEARGBROW_AVX2
+void MergeARGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0,%2),%%xmm0 \n" // B
+ "vmovdqu (%0,%1),%%xmm1 \n" // R
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
+ "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%4) \n" // First 8
+ "vmovdqu %%ymm1,32(%4) \n" // Next 8
+
+ "lea 16(%0),%0 \n"
+ "lea 64(%4),%4 \n"
+ "sub $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEARGBROW_AVX2
+
+#ifdef HAS_MERGEXRGBROW_AVX2
+void MergeXRGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%2),%%xmm0 \n" // B
+ "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
+ "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3) \n" // First 8
+ "vmovdqu %%ymm1,32(%3) \n" // Next 8
+
+ "lea 16(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 64(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_AVX2
+
+#ifdef HAS_SPLITARGBROW_SSE2
+void SplitARGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "sub $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+rm"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITARGBROW_SSE2
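+
+// The unpack ladder above is in effect a byte transpose; its net result
+// matches this scalar split (sketch, illustrative name; ARGB is stored
+// B,G,R,A in memory):
+static void SplitARGBRow_Sketch(const uint8_t* src_argb,
+                                uint8_t* dst_r,
+                                uint8_t* dst_g,
+                                uint8_t* dst_b,
+                                uint8_t* dst_a,
+                                int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_b[x] = src_argb[4 * x + 0];
+    dst_g[x] = src_argb[4 * x + 1];
+    dst_r[x] = src_argb[4 * x + 2];
+    dst_a[x] = src_argb[4 * x + 3];
+  }
+}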
+
+#ifdef HAS_SPLITXRGBROW_SSE2
+void SplitXRGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITXRGBROW_SSE2
+
+static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+#ifdef HAS_SPLITARGBROW_SSSE3
+void SplitARGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "movdqa %6,%%xmm3 \n"
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "subl $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif // HAS_SPLITARGBROW_SSSE3
+
+#ifdef HAS_SPLITXRGBROW_SSSE3
+void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ "movdqa %5,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif // HAS_SPLITXRGBROW_SSSE3
+
+#ifdef HAS_SPLITARGBROW_AVX2
+static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
+void SplitARGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+ "vmovdqa %7,%%ymm3 \n"
+ "vbroadcastf128 %6,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%1,%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%1,%2) \n" // G
+ "vextracti128 $1,%%ymm2,(%1,%4) \n" // A
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit), // %6
+ "m"(kShuffleMaskARGBPermute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_SPLITARGBROW_AVX2
+
+#ifdef HAS_SPLITXRGBROW_AVX2
+void SplitXRGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ "vmovdqa %6,%%ymm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%2) \n" // G
+
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 16(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit), // %5
+ "m"(kShuffleMaskARGBPermute) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_SPLITXRGBROW_AVX2
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+void MergeXR30Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = depth - 10;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $6,%%ymm6,%%ymm6 \n"
+ "vmovd %5,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1),%%ymm1 \n"
+ "vmovdqu (%0,%2),%%ymm2 \n"
+ "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
+ "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
+ "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
+ "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
+ "vpslld $0xa,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+ "vmovdqu %%ymm0,(%3) \n"
+ "vmovdqu %%ymm3,0x20(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+#if defined(__i386__)
+ : "m"(shift) // %5
+#else
+ : "rm"(shift) // %5
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_MERGEXR30ROW_AVX2
+
+#ifdef HAS_MERGEAR64ROW_AVX2
+static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
+void MergeAR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ mask = (mask << 16) + mask;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "vmovdqa %8,%%ymm5 \n"
+ "vmovd %6,%%xmm6 \n"
+ "vbroadcastss %7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%4) \n"
+ "vmovdqu %%ymm2,0x20(%4) \n"
+ "vmovdqu %%ymm4,0x40(%4) \n"
+ "vmovdqu %%ymm1,0x60(%4) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%4),%4 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(mask), // %7
+ "m"(MergeAR64Permute) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
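+
+// For reference: a scalar sketch of the AR64 merge above, assuming the same
+// semantics as the AVX2 path: clamp each channel to depth bits (vpminuw
+// against the broadcast mask), scale up to 16 bits, and store interleaved
+// B,G,R,A. The name is illustrative only.
+static void MergeAR64Row_Sketch_C(const uint16_t* src_r,
+                                  const uint16_t* src_g,
+                                  const uint16_t* src_b,
+                                  const uint16_t* src_a,
+                                  uint16_t* dst_ar64,
+                                  int depth,
+                                  int width) {
+  int shift = 16 - depth;
+  uint16_t max = (uint16_t)((1 << depth) - 1);
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ar64[0] = (uint16_t)((src_b[x] > max ? max : src_b[x]) << shift);
+    dst_ar64[1] = (uint16_t)((src_g[x] > max ? max : src_g[x]) << shift);
+    dst_ar64[2] = (uint16_t)((src_r[x] > max ? max : src_r[x]) << shift);
+    dst_ar64[3] = (uint16_t)((src_a[x] > max ? max : src_a[x]) << shift);
+    dst_ar64 += 4;
+  }
+}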
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+void MergeXR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ mask = (mask << 16) + mask;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vmovdqa %7,%%ymm5 \n"
+ "vmovd %5,%%xmm6 \n"
+ "vbroadcastss %6,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff)
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%3) \n"
+ "vmovdqu %%ymm2,0x20(%3) \n"
+ "vmovdqu %%ymm4,0x40(%3) \n"
+ "vmovdqu %%ymm1,0x60(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(mask), // %6
+ "m"(MergeAR64Permute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15};
+void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "vbroadcastf128 %7,%%ymm5 \n"
+ "vmovd %6,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%4) \n"
+ "vmovdqu %%ymm0,0x20(%4) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%4),%4 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(MergeARGB16To8Shuffle) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
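+
+// For reference: a scalar sketch of the 16-to-8 merge above. Each channel is
+// shifted down by (depth - 8) and saturated to a byte (the packuswb), then
+// stored interleaved B,G,R,A. Illustrative name only.
+static void MergeARGB16To8Row_Sketch_C(const uint16_t* src_r,
+                                       const uint16_t* src_g,
+                                       const uint16_t* src_b,
+                                       const uint16_t* src_a,
+                                       uint8_t* dst_argb,
+                                       int depth,
+                                       int width) {
+  int shift = depth - 8;
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = src_b[x] >> shift;
+    int g = src_g[x] >> shift;
+    int r = src_r[x] >> shift;
+    int a = src_a[x] >> shift;
+    dst_argb[0] = (uint8_t)(b > 255 ? 255 : b);  // packuswb saturates
+    dst_argb[1] = (uint8_t)(g > 255 ? 255 : g);
+    dst_argb[2] = (uint8_t)(r > 255 ? 255 : r);
+    dst_argb[3] = (uint8_t)(a > 255 ? 255 : a);
+    dst_argb += 4;
+  }
+}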
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vbroadcastf128 %6,%%ymm5 \n"
+ "vmovd %5,%%xmm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%3) \n"
+ "vmovdqu %%ymm0,0x20(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(MergeARGB16To8Shuffle) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Handles any width (multiple of 1); rep movsb copies one byte at a time.
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep movsb \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
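+
+// For reference: the SSE2 code above is a masked merge; in scalar terms it
+// simply replaces the alpha byte of each dst pixel with the src alpha while
+// keeping dst's B,G,R. Illustrative name only.
+static void ARGBCopyAlphaRow_Sketch_C(const uint8_t* src, uint8_t* dst,
+                                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha is byte 3 of each ARGB pixel
+  }
+}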
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
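+
+// For reference: a scalar sketch of the extraction above; the psrld/pack
+// sequence is just gathering the high (alpha) byte of each 32-bit pixel.
+// Illustrative name only.
+static void ARGBExtractAlphaRow_Sketch_C(const uint8_t* src_argb,
+                                         uint8_t* dst_a, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_a[x] = src_argb[x * 4 + 3];
+  }
+}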
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+ 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
+ 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep stosb \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_SSE2
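+
+// For reference: YUY2 stores two pixels as Y0,U,Y1,V. The UV row kernels
+// above average the chroma of two adjacent source rows with pavgb, which
+// rounds up. A scalar sketch of YUY2ToUVRow (illustrative name only):
+static void YUY2ToUVRow_Sketch_C(const uint8_t* src_yuy2, int stride_yuy2,
+                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next = src_yuy2 + stride_yuy2;  // row below
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);  // pavgb rounds
+    *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);
+    src_yuy2 += 4;
+    next += 4;
+  }
+}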
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend 4 pixels at a time, with a 1 pixel tail loop.
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
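+
+// For reference: unwinding the SSSE3 blend above, the pxor/paddw pair turns
+// src0 alpha into (256 - a), so each color channel computes
+//   dst = sat(src0 + src1 * (256 - a0) / 256)
+// with dst alpha forced to 255. A scalar sketch (illustrative name only):
+static void ARGBBlendRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                  uint8_t* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t fb = 256 - src0[3];  // back-blend factor from src0 alpha
+    int c;
+    for (c = 0; c < 3; ++c) {
+      uint32_t v = src0[c] + ((src1[c] * fb) >> 8);
+      dst[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates
+    }
+    dst[3] = 255;
+    src0 += 4;
+    src1 += 4;
+    dst += 4;
+  }
+}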
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// Unsigned version of the math:
+//   dst = ((A2*C2) + (B2*(255-C2)) + 255) / 256
+// Signed version of the math:
+//   dst = (((A2-128)*C2) + ((B2-128)*(255-C2)) + 32768 + 127) / 256
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
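+
+// For reference: the unsigned formula quoted above, written out in scalar
+// form (illustrative name only). The SSSE3 path gets the same result via
+// pmaddubsw on sign-biased inputs plus the 0x807f rounding constant.
+static void BlendPlaneRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                   const uint8_t* alpha, uint8_t* dst,
+                                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = alpha[x];
+    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
+  }
+}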
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// Unsigned version of the math:
+//   dst = ((A2*C2) + (B2*(255-C2)) + 255) / 256
+// Signed version of the math:
+//   dst = (((A2-128)*C2) + ((B2-128)*(255-C2)) + 32768 + 127) / 256
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128,
+ -128, -128, 14, -128, 14, -128,
+ 14, -128, -128, -128};
+
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "punpcklbw %%xmm6,%%xmm7 \n"
+ "sub %0,%1 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0
+ "pshufb %%xmm4,%%xmm3 \n"
+ "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha
+ "pmullw %%xmm3,%%xmm1 \n"
+ "paddw %%xmm7,%%xmm0 \n" // + 255
+ "paddw %%xmm7,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm6 \n"
+ "por %%xmm6,%%xmm0 \n"
+ "movdqu %%xmm0,(%0,%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kAttenuateShuffle) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
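+
+// For reference: the attenuate kernel above multiplies each color channel by
+// alpha with +255 rounding, keeping alpha itself unchanged. Scalar sketch
+// (illustrative name only):
+static void ARGBAttenuateRow_Sketch_C(const uint8_t* src_argb,
+                                      uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = src_argb[3];
+    dst_argb[0] = (uint8_t)((src_argb[0] * a + 255) >> 8);
+    dst_argb[1] = (uint8_t)((src_argb[1] * a + 255) >> 8);
+    dst_argb[2] = (uint8_t)((src_argb[2] * a + 255) >> 8);
+    dst_argb[3] = (uint8_t)a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}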
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+
+// Shuffle table duplicating alpha.
+static const lvec8 kAttenuateShuffle_AVX2 = {
+ 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14,
+ -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128,
+ -128, -128, 30, -128, 30, -128, 30, -128, -128, -128};
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
+ "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmullw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kAttenuateShuffle_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
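+
+// For reference: a scalar sketch of the unattenuate above, assuming (as the
+// pshuflw/pmulhuw sequence suggests) that the low 16 bits of fixed_invtbl8[a]
+// hold an 8.8 fixed-point reciprocal of alpha. Illustrative name only.
+static void ARGBUnattenuateRow_Sketch_C(const uint8_t* src_argb,
+                                        uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = src_argb[3];
+    uint32_t ia = fixed_invtbl8[a] & 0xffff;  // assumed ~65536 / a
+    int c;
+    for (c = 0; c < 3; ++c) {
+      uint32_t v = (src_argb[c] * ia) >> 8;
+      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);  // packuswb saturates
+    }
+    dst_argb[3] = (uint8_t)a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}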
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
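+
+// For reference: the sepia formulas quoted above, in scalar form. Note the
+// constant tables are laid out B,G,R per pixel, so e.g. kARGBToSepiaB
+// computes b*17 + g*68 + r*35. Illustrative name only.
+static void ARGBSepiaRow_Sketch_C(uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);  // packuswb saturates
+    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
+    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // alpha byte is left as-is
+  }
+}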
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with a color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
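+
+// For reference: the quantize kernel reduces each color channel to a bucket:
+//   dst = ((v * scale) >> 16) * interval_size + interval_offset
+// (pmulhuw supplies the >> 16). Scalar sketch, illustrative name only:
+static void ARGBQuantizeRow_Sketch_C(uint8_t* dst_argb, int scale,
+                                     int interval_size, int interval_offset,
+                                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int c;
+    for (c = 0; c < 3; ++c) {
+      int v = dst_argb[c];
+      dst_argb[c] =
+          (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
+    }
+    dst_argb += 4;  // the SSE2 path re-ORs the original alpha back in
+  }
+}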
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBSHADEROW_SSE2
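+
+// For reference: the shade kernel multiplies every channel (alpha included)
+// by the matching byte of 'value'. Replicating a byte into both halves of a
+// word (v * 0x101), taking pmulhuw's high 16 bits, then >> 8 is a cheap
+// v * s / 255 approximation. Scalar sketch, illustrative name only:
+static void ARGBShadeRow_Sketch_C(const uint8_t* src_argb, uint8_t* dst_argb,
+                                  int width, uint32_t value) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int c;
+    for (c = 0; c < 4; ++c) {
+      uint32_t v = src_argb[c] * 0x101u;                  // punpcklbw self
+      uint32_t s = ((value >> (c * 8)) & 0xff) * 0x101u;  // shade channel
+      dst_argb[c] = (uint8_t)((v * s) >> 24);  // pmulhuw then psrlw $8
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}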
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
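+
+// For reference: the multiply kernel computes, per byte,
+//   dst = (src0 * 0x101 * src1) >> 16
+// which approximates src0 * src1 / 255 (the punpcklbw-with-self replication
+// is the * 0x101). Scalar sketch, illustrative name only:
+static void ARGBMultiplyRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                     uint8_t* dst, int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    dst[i] = (uint8_t)((src0[i] * 0x101 * src1[i]) >> 16);
+  }
+}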
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELXROW_SSE2
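+
+// For reference: a scalar sketch of the SobelX kernel above, mirroring the
+// row arithmetic directly (illustrative name only). Like the SSE2 path, it
+// reads two bytes past 'width'.
+static void SobelXRow_Sketch_C(const uint8_t* src_y0, const uint8_t* src_y1,
+                               const uint8_t* src_y2, uint8_t* dst_sobelx,
+                               int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i] - src_y0[i + 2];
+    int b = src_y1[i] - src_y1[i + 2];
+    int c = src_y2[i] - src_y2[i + 2];
+    int sobel = a + c + 2 * b;      // the -1 0 1 / -2 0 2 / -1 0 1 taps
+    if (sobel < 0) sobel = -sobel;  // pmaxsw against the negation
+    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // packuswb clamps
+  }
+}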
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each entry is the sum of all
+// values above and to the left of it, inclusive.
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
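+
+// For reference: a scalar sketch of the cumulative-sum row (illustrative
+// name only). A running per-channel sum of this row is added to the row
+// above's cumulative sums, yielding a standard integral image.
+static void ComputeCumulativeSumRow_Sketch_C(const uint8_t* row,
+                                             int32_t* cumsum,
+                                             const int32_t* previous_cumsum,
+                                             int width) {
+  int32_t row_sum[4] = {0, 0, 0, 0};
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      row_sum[c] += row[x * 4 + c];
+      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}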
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
+ int width,
+ int area,
+ uint8_t* dst,
+ int count) {
+ asm volatile(
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
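+
+// Editorial note: a scalar sketch of the box-filter average computed above
+// from the integral image (illustrative; the helper name is hypothetical).
+// The per-channel box sum is topleft[x] - topleft[x+w] - botleft[x] +
+// botleft[x+w]; the SIMD path multiplies by a reciprocal instead of dividing.
+static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
+                                             const int32_t* botleft,
+                                             int width,
+                                             int area,
+                                             uint8_t* dst,
+                                             int count) {
+  float ooa = 1.0f / area;
+  int i, c;
+  for (i = 0; i < count; ++i) {
+    for (c = 0; c < 4; ++c) {
+      int32_t sum = topleft[c] - topleft[width * 4 + c] - botleft[c] +
+                    botleft[width * 4 + c];
+      dst[c] = (uint8_t)(sum * ooa);
+    }
+    topleft += 4;
+    botleft += 4;
+    dst += 4;
+  }
+}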
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from a source image, stepping by an affine slope (dudv), to a destination row.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* src_dudv,
+ int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp;
+ asm volatile(
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
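+
+// Editorial note: scalar sketch of the affine walk above (illustrative).
+// src_dudv holds the starting (u, v) followed by the per-pixel (du, dv);
+// each destination pixel copies the source pixel at the truncated
+// coordinates, which is what cvttps2dq + pmaddwd compute vectorized.
+static void ARGBAffineRow_Sketch(const uint8_t* src_argb,
+                                 int src_argb_stride,
+                                 uint8_t* dst_argb,
+                                 const float* src_dudv,
+                                 int width) {
+  float u = src_dudv[0];
+  float v = src_dudv[1];
+  int i, c;
+  for (i = 0; i < width; ++i) {
+    int x = (int)u;  // truncate toward zero, as cvttps2dq does
+    int y = (int)v;
+    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;
+    for (c = 0; c < 4; ++c) {
+      dst_argb[i * 4 + c] = s[c];
+    }
+    u += src_dudv[2];
+    v += src_dudv[3];
+  }
+}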
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ asm volatile(
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
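+
+// Editorial note: scalar sketch of the blend above (illustrative). The
+// pmaddubsw path with the 0x80 byte bias computes, up to rounding detail,
+// the usual fixed-point lerp between the two rows:
+static void InterpolateRow_Sketch(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int width,
+                                  int source_y_fraction) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - source_y_fraction) +
+                            src_ptr1[x] * source_y_fraction + 128) >>
+                           8);
+  }
+}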
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ asm volatile(
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ "vzeroupper \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+
+ "movdqu (%3),%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 (%3),%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
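+
+// Editorial note: the shuffler is a 16-byte per-block byte-selection mask.
+// For example, a hypothetical mask (not a constant shipped here) that swaps
+// the R and B channels of each ARGB pixel would be:
+//   static const uvec8 kShuffleSwapRB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u,
+//                                        10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+//   ARGBShuffleRow_SSSE3(src, dst, (const uint8_t*)&kShuffleSwapRB, width);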
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_SSE2
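+
+// Editorial note: a scalar sketch of the packing above (illustrative).
+// YUY2 stores each pair of luma samples as Y0 U Y1 V; UYVY stores U Y0 V Y1.
+static void I422ToYUY2Row_Sketch(const uint8_t* src_y,
+                                 const uint8_t* src_u,
+                                 const uint8_t* src_v,
+                                 uint8_t* dst_yuy2,
+                                 int width) {
+  int x;
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_yuy2[0] = src_y[0];
+    dst_yuy2[1] = src_u[0];
+    dst_yuy2[2] = src_y[1];
+    dst_yuy2[3] = src_v[0];
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_yuy2 += 4;
+  }
+}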
+
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_AVX2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile(
+
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 (%3),%%ymm4 \n"
+ "vbroadcastf128 0x10(%3),%%ymm5 \n"
+ "vbroadcastf128 0x20(%3),%%ymm6 \n"
+ "vbroadcastf128 0x30(%3),%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
+ "lea 0x8(%0),%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
+ // X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
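+
+// Editorial note: scalar sketch of the per-channel cubic evaluated above
+// (illustrative). |poly| holds four floats per coefficient: C0 at +0 bytes,
+// C1 at +16, C2 at +32, C3 at +48, so for a channel value v the result is
+// clamp(C0 + C1*v + C2*v*v + C3*v*v*v, 0, 255).
+static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
+                                     uint8_t* dst_argb,
+                                     const float* poly,
+                                     int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    float v = (float)src_argb[i];
+    int c = i & 3;  // channel selects the coefficient lane
+    float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
+              poly[c + 12] * v * v * v;
+    if (r < 0.f) r = 0.f;
+    if (r > 255.f) r = 255.f;
+    dst_argb[i] = (uint8_t)r;
+  }
+}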
+
+#ifdef HAS_HALFFLOATROW_SSE2
+static const float kScaleBias = 1.9259299444e-34f;  // 2^-112
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "movd %3,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,-0x10(%0,%1,1) \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(scale) // %3
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_SSE2
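+
+// Editorial note on the trick above: kScaleBias is 2^-112. Folding it into
+// the scale rebiases the float32 exponent (bias 127) toward the float16
+// bias (15), so after the multiply the raw bits only need a 13-bit right
+// shift (psrld $0xd) to land the exponent and mantissa in their half-float
+// positions; no dedicated conversion instruction is required.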
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm2", "xmm3");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
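+
+// Editorial note: scalar sketch of the in-place lookup above (illustrative).
+// Each channel indexes its own column of a 256-entry ARGB table.
+static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
+                                     const uint8_t* table_argb,
+                                     int width) {
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      dst_argb[x * 4 + c] = table_argb[dst_argb[x * 4 + c] * 4 + c];
+    }
+  }
+}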
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
+ uintptr_t pixel_temp;
+ uintptr_t table_temp;
+ asm volatile(
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+static const uvec8 kYUV24Shuffle[3] = {
+ {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
+ {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
+ {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
+
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "movdqa (%4),%%xmm4 \n" // 3 shuffler constants
+ "movdqa 16(%4),%%xmm5 \n"
+ "movdqa 32(%4),%%xmm6 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // load 16 Y values
+ "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values
+ "lea 16(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3
+ "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5
+ "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7
+ "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24
+ "pshufb %%xmm5, %%xmm1 \n"
+ "pshufb %%xmm6, %%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm1,16(%2) \n"
+ "movdqu %%xmm2,32(%2) \n"
+ "lea 48(%2),%2 \n"
+ "sub $16,%3 \n" // 16 pixels per loop
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
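+
+// Editorial note: scalar sketch of the repack above (illustrative). Each
+// output triple is V, U, Y, with one VU pair shared by two Y samples.
+static void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
+                                  const uint8_t* src_vu,
+                                  uint8_t* dst_yuv24,
+                                  int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_yuv24[x * 3 + 0] = src_vu[(x & ~1) + 0];  // V
+    dst_yuv24[x * 3 + 1] = src_vu[(x & ~1) + 1];  // U
+    dst_yuv24[x * 3 + 2] = src_y[x];              // Y
+  }
+}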
+
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
+ "vbroadcastf128 16(%4),%%ymm5 \n"
+ "vbroadcastf128 32(%4),%%ymm6 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
+ "lea 32(%0),%0 \n"
+ "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
+ "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
+ "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
+ "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm3,(%2) \n"
+ "vmovdqu %%ymm0,32(%2) \n"
+ "vmovdqu %%ymm1,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_NV21ToYUV24ROW_AVX512
+// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
+static const lvec8 kYUV24Perm[3] = {
+ {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
+ 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
+ {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
+ 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
+ {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
+ 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
+
+void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
+ "vmovdqa 32(%4),%%ymm5 \n"
+ "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
+ "lea 32(%0),%0 \n"
+ "vmovdqa %%ymm2, %%ymm0 \n"
+ "vmovdqa %%ymm2, %%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
+ "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "vmovdqu %%ymm1,32(%2) \n"
+ "vmovdqu %%ymm2,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Perm[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#endif // HAS_NV21ToYUV24ROW_AVX512
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for swapping adjacent byte pairs (UV -> VU).
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "movdqu %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
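+
+// Editorial note: scalar sketch of the 2x2 downsample above (illustrative).
+// Each output UV pair is the rounded average of a 2x2 block of U and of V.
+static void HalfMergeUVRow_Sketch(const uint8_t* src_u,
+                                  int src_stride_u,
+                                  const uint8_t* src_v,
+                                  int src_stride_v,
+                                  uint8_t* dst_uv,
+                                  int width) {
+  int x;
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
+                           src_u[src_stride_u + 1] + 2) >> 2);
+    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
+                           src_v[src_stride_v + 1] + 2) >> 2);
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+}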
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add 4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add 4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
new file mode 100644
index 00000000..be85022e
--- /dev/null
+++ b/source/row_lasx.cc
@@ -0,0 +1,2304 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { \
+ ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+// Load 32 YUV422 pixel data
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+ temp1 = __lasx_xvld(psrc_v, 0); \
+ temp0 = __lasx_xvsub_b(temp0, const_0x80); \
+ temp1 = __lasx_xvsub_b(temp1, const_0x80); \
+ temp0 = __lasx_vext2xv_h_b(temp0); \
+ temp1 = __lasx_vext2xv_h_b(temp1); \
+ uv_l = __lasx_xvilvl_h(temp0, temp1); \
+ uv_h = __lasx_xvilvh_h(temp0, temp1); \
+ }
+
+// Load 16 YUV422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ out_y = __lasx_xvld(psrc_y, 0); \
+ temp0 = __lasx_xvldrepl_d(psrc_u, 0); \
+ temp1 = __lasx_xvldrepl_d(psrc_v, 0); \
+ uv = __lasx_xvilvl_b(temp0, temp1); \
+ uv = __lasx_xvsub_b(uv, const_0x80); \
+ uv = __lasx_vext2xv_h_b(uv); \
+ }
+
+// Convert 32 pixels of YUV420 to RGB.
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+ g_h, r_l, r_h) \
+ { \
+ __m256i u_l, u_h, v_l, v_h; \
+ __m256i yl_ev, yl_od, yh_ev, yh_od; \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ temp1 = __lasx_xvilvh_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \
+ yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \
+ DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
+ yl_ev, yl_od, yh_ev, yh_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ yh_ev = __lasx_xvadd_w(yh_ev, yb); \
+ yh_od = __lasx_xvadd_w(yh_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \
+ v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \
+ u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, u_h); \
+ temp3 = __lasx_xvadd_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ b_l = __lasx_xvpackev_h(temp1, temp0); \
+ b_h = __lasx_xvpackev_h(temp3, temp2); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, v_h); \
+ temp3 = __lasx_xvadd_w(yh_od, v_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ r_l = __lasx_xvpackev_h(temp1, temp0); \
+ r_h = __lasx_xvpackev_h(temp3, temp2); \
+ DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ temp2 = __lasx_xvsub_w(yh_ev, u_h); \
+ temp3 = __lasx_xvsub_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ g_l = __lasx_xvpackev_h(temp1, temp0); \
+ g_h = __lasx_xvpackev_h(temp3, temp2); \
+ }
+
+// Convert 16 pixels of YUV420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m256i u_l, v_l, yl_ev, yl_od; \
+ __m256i temp0, temp1; \
+ \
+ in_y = __lasx_xvpermi_d(in_y, 0xD8); \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_b = __lasx_xvpackev_h(temp1, temp0); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_r = __lasx_xvpackev_h(temp1, temp0); \
+ u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_g = __lasx_xvpackev_h(temp1, temp0); \
+ }
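+
+// Editorial note: per pixel, the two macros above implement this fixed-point
+// form (illustrative; u and v are centered on 0 by the 0x80 subtraction in
+// the loaders, and y is widened to y * 0x0101 by the self-interleave):
+//   y1 = ((y * 0x0101 * yg) >> 16) + yb
+//   B  = clip255((y1 + u * ub) >> 6)
+//   G  = clip255((y1 - (u * ug + v * vg)) >> 6)
+//   R  = clip255((y1 + v * vr) >> 6)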
+
+// Pack and Store 32 ARGB values.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(g_l, b_l); \
+ temp1 = __lasx_xvpackev_b(a_l, r_l); \
+ temp2 = __lasx_xvpackev_b(g_h, b_h); \
+ temp3 = __lasx_xvpackev_b(a_h, r_h); \
+ r_l = __lasx_xvilvl_h(temp1, temp0); \
+ r_h = __lasx_xvilvh_h(temp1, temp0); \
+ g_l = __lasx_xvilvl_h(temp3, temp2); \
+ g_h = __lasx_xvilvh_h(temp3, temp2); \
+ temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \
+ temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \
+ temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \
+ temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ __lasx_xvst(temp2, pdst_argb, 64); \
+ __lasx_xvst(temp3, pdst_argb, 96); \
+ pdst_argb += 128; \
+ }
+
+// Pack and Store 16 ARGB values.
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(in_g, in_b); \
+ temp1 = __lasx_xvpackev_b(in_a, in_r); \
+ temp2 = __lasx_xvilvl_h(temp1, temp0); \
+ temp3 = __lasx_xvilvh_h(temp1, temp0); \
+ temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \
+ temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ pdst_argb += 64; \
+ }
+
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \
+ _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \
+ }
+
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 64;
+ __m256i src0, src1;
+ __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607,
+ 0x08090A0B0C0D0E0F, 0x0001020304050607};
+ src += width - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ src0 = __lasx_xvpermi_q(src0, src0, 0x01);
+ src1 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(src1, dst, 0);
+ __lasx_xvst(src0, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
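+
+// Editorial note: these LASX rows, like the other SIMD paths in this
+// library, generally process full vectors only (I422AlphaToARGBRow_LASX
+// below being a partial exception); odd-width remainders are expected to
+// go through the _Any wrappers in row_any.cc.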
+
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src, dst;
+ __m256i shuffler = {0x0004000500060007, 0x0000000100020003,
+ 0x0004000500060007, 0x0000000100020003};
+
+ src_uv += (width - 16) << 1;
+ for (x = 0; x < len; x++) {
+ src = __lasx_xvld(src_uv, 0);
+ dst = __lasx_xvshuf_h(shuffler, src, src);
+ dst = __lasx_xvpermi_q(dst, dst, 0x01);
+ __lasx_xvst(dst, dst_uv, 0);
+ src_uv -= 32;
+ dst_uv += 32;
+ }
+}
+
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1;
+ __m256i dst0, dst1;
+ __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504,
+ 0x0B0A09080F0E0D0C, 0x0302010007060504};
+ src += (width * 4) - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ dst1 = __lasx_xvpermi_q(src0, src0, 0x01);
+ dst0 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(dst0, dst, 0);
+ __lasx_xvst(dst1, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_yuy2_0, vec_yuy2_1;
+ __m256i dst_yuy2_0, dst_yuy2_1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0);
+ vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0);
+ dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20);
+ dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31);
+ __lasx_xvst(dst_yuy2_0, dst_yuy2, 0);
+ __lasx_xvst(dst_yuy2_1, dst_yuy2, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_yuy2 += 64;
+ }
+}
+
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_uyvy0, vec_uyvy1;
+ __m256i dst_uyvy0, dst_uyvy1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0);
+ vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0);
+ dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20);
+ dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31);
+ __lasx_xvst(dst_uyvy0, dst_uyvy, 0);
+ __lasx_xvst(dst_uyvy1, dst_uyvy, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_uyvy += 64;
+ }
+}
+
+void I422ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I422ToRGBARow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ int res = width & 31;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
+
+ y = __lasx_xvld(src_a, 0);
+ a_l = __lasx_xvilvl_b(zero, y);
+ a_h = __lasx_xvilvh_b(zero, y);
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ src_a += 32;
+ }
+ if (res) {
+ __m256i y, uv, r, g, b, a;
+ a = __lasx_xvld(src_a, 0);
+ a = __lasx_vext2xv_hu_bu(a);
+ READYUV422(src_y, src_u, src_v, y, uv);
+ YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
+ STOREARGB(a, r, g, b, dst_argb);
+ }
+}
+
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
+ 0x0504120302100100, 0x0A18090816070614};
+ __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
+ 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ temp0 = __lasx_xvpackev_b(g_l, b_l);
+ temp1 = __lasx_xvpackev_b(g_h, b_h);
+ DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
+ r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
+ temp1);
+
+ b_l = __lasx_xvilvl_d(temp1, temp2);
+ b_h = __lasx_xvilvh_d(temp3, temp1);
+ temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
+ temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
+ temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
+ __lasx_xvst(temp1, dst_argb, 0);
+ __lasx_xvst(temp2, dst_argb, 32);
+ __lasx_xvst(temp3, dst_argb, 64);
+ dst_argb += 96;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
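+// For reference, RGB565 packs each pixel as (R >> 3) << 11 | (G >> 2) << 5 |
+// (B >> 3): bits 15..11 red, 10..5 green, 4..0 blue.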
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 2);
+ g_h = __lasx_xvsrli_h(g_h, 2);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 11);
+ r_h = __lasx_xvslli_h(r_h, 11);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_rgb565, 0);
+ __lasx_xvst(dst_h, dst_rgb565, 32);
+ dst_rgb565 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
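+// For reference, ARGB4444 packs each pixel as A << 12 | R << 8 | G << 4 | B
+// with 4 bits per channel; the 0xF000 constant supplies opaque alpha.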
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = (__m256i)v4u64{0xF000F000F000F000, 0xF000F000F000F000,
+ 0xF000F000F000F000, 0xF000F000F000F000};
+ __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0,
+ 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 4);
+ b_h = __lasx_xvsrli_h(b_h, 4);
+ r_l = __lasx_xvsrli_h(r_l, 4);
+ r_h = __lasx_xvsrli_h(r_h, 4);
+ g_l = __lasx_xvand_v(g_l, mask);
+ g_h = __lasx_xvand_v(g_h, mask);
+ r_l = __lasx_xvslli_h(r_l, 8);
+ r_h = __lasx_xvslli_h(r_h, 8);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb4444, 0);
+ __lasx_xvst(dst_h, dst_argb4444, 32);
+ dst_argb4444 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
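+// For reference, ARGB1555 packs each pixel as A << 15 | R << 10 | G << 5 | B
+// with 5 bits per color channel; the 0x8000 constant supplies opaque alpha.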
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 3);
+ g_h = __lasx_xvsrli_h(g_h, 3);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 10);
+ r_h = __lasx_xvslli_h(r_h, 10);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb1555, 0);
+ __lasx_xvst(dst_h, dst_argb1555, 32);
+ dst_argb1555 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ dst0 = __lasx_xvpickev_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_yuy2 += 64;
+ dst_y += 32;
+ }
+}
+
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0,
+ src_yuy2_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickod_b(src1, src0);
+ src1 = __lasx_xvpickod_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ src_yuy2_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ tmp0 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ dst0 = __lasx_xvpickod_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_uyvy += 64;
+ dst_y += 32;
+ }
+}
+
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0,
+ src_uyvy_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickev_b(src1, src0);
+ src1 = __lasx_xvpickev_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ src_uyvy_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToUVRow_LASX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
+ 0x0038003800380038, 0x0038003800380038};
+ __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
+ 0x0025002500250025, 0x0025002500250025};
+ __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
+ 0x0013001300130013, 0x0013001300130013};
+ __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
+ 0x002f002f002f002f, 0x002f002f002f002f};
+ __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
+ 0x0009000900090009, 0x0009000900090009};
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+ __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
+ src_argb1, 96, src4, src5, src6, src7);
+ vec0 = __lasx_xvaddwev_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwev_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwev_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwev_h_bu(src3, src7);
+ tmp0 = __lasx_xvpickev_h(vec1, vec0);
+ tmp1 = __lasx_xvpickev_h(vec3, vec2);
+ tmp2 = __lasx_xvpickod_h(vec1, vec0);
+ tmp3 = __lasx_xvpickod_h(vec3, vec2);
+ vec0 = __lasx_xvaddwod_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwod_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwod_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwod_h_bu(src3, src7);
+ tmp4 = __lasx_xvpickev_h(vec1, vec0);
+ tmp5 = __lasx_xvpickev_h(vec3, vec2);
+ vec0 = __lasx_xvpickev_h(tmp1, tmp0);
+ vec1 = __lasx_xvpickod_h(tmp1, tmp0);
+ src0 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp3, tmp2);
+ vec1 = __lasx_xvpickod_h(tmp3, tmp2);
+ src1 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp5, tmp4);
+ vec1 = __lasx_xvpickod_h(tmp5, tmp4);
+ src2 = __lasx_xvavgr_h(vec0, vec1);
+ dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
+ dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
+ dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_argb0 += 128;
+ src_argb1 += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100,
+ 0x000000000E0D0C0A};
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
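+ // Each store above writes 32 bytes for 24 valid bytes of RGB24, so the last
+ // 32 pixels are handled here with element stores to avoid writing past the
+ // end of dst_rgb.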
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102,
+ 0x000000000C0D0E08};
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
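+ // As in ARGBToRGB24Row_LASX, the last 32 pixels use element stores so the
+ // overlapping 32-byte stores cannot run past the end of dst_rgb.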
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, tmp0, tmp1, dst0;
+ __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300,
+ 0x0300030003000300};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 3);
+ tmp1 = __lasx_xvpackev_b(zero, tmp1);
+ tmp1 = __lasx_xvsrli_h(tmp1, 2);
+ tmp0 = __lasx_xvsll_b(tmp0, shift);
+ tmp1 = __lasx_xvslli_h(tmp1, 5);
+ dst0 = __lasx_xvor_v(tmp0, tmp1);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703,
+ 0x0703070307030703};
+ __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200,
+ 0x0200020002000200};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 3);
+ tmp1 = __lasx_xvsrl_b(tmp1, shift1);
+ tmp0 = __lasx_xvsll_b(tmp0, shift2);
+ tmp2 = __lasx_xvpackev_b(zero, tmp1);
+ tmp3 = __lasx_xvpackod_b(zero, tmp1);
+ tmp2 = __lasx_xvslli_h(tmp2, 5);
+ tmp3 = __lasx_xvslli_h(tmp3, 15);
+ dst0 = __lasx_xvor_v(tmp0, tmp2);
+ dst0 = __lasx_xvor_v(dst0, tmp3);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp1 = __lasx_xvandi_b(tmp1, 0xF0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 4);
+ dst0 = __lasx_xvor_v(tmp1, tmp0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
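+// Per-pixel chroma (no subsampling):
+//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8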
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1;
+ __m256i const_112 = __lasx_xvldi(112);
+ __m256i const_74 = __lasx_xvldi(74);
+ __m256i const_38 = __lasx_xvldi(38);
+ __m256i const_94 = __lasx_xvldi(94);
+ __m256i const_18 = __lasx_xvldi(18);
+ __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvpickev_h(src1, src0);
+ tmp1 = __lasx_xvpickod_h(src1, src0);
+ tmp2 = __lasx_xvpickev_h(src3, src2);
+ tmp3 = __lasx_xvpickod_h(src3, src2);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
+ reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
+ reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
+ reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
+ reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
+ reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
+ reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ __lasx_xvst(dst0, dst_u, 0);
+ __lasx_xvst(dst1, dst_v, 0);
+ dst_u += 32;
+ dst_v += 32;
+ src_argb += 128;
+ }
+}
+
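+// Per-channel multiply: interleaving a byte with itself yields c * 257, so the
+// unsigned high-half multiply computes (c * 257 * s) >> 16, a close
+// approximation of c * s / 255.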
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, dst0, dst1;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ tmp0 = __lasx_xvilvl_b(src0, src0);
+ tmp1 = __lasx_xvilvh_b(src0, src0);
+ tmp2 = __lasx_xvilvl_b(zero, src1);
+ tmp3 = __lasx_xvilvh_b(zero, src1);
+ dst0 = __lasx_xvmuh_hu(tmp0, tmp2);
+ dst1 = __lasx_xvmuh_hu(tmp1, tmp3);
+ dst0 = __lasx_xvpickev_b(dst1, dst0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lasx_xvsadd_bu(src0, src1);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lasx_xvssub_bu(src0, src1);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
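+// Premultiply each color channel by alpha: channel and alpha bytes are
+// widened by duplication (x * 257) and the 32-bit products shifted right by
+// 24, approximating c * a / 255.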
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m256i b, g, r, a, dst0, dst1;
+ __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
+ 0x0007000300060002};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ b = __lasx_xvpackev_b(tmp0, tmp0);
+ r = __lasx_xvpackod_b(tmp0, tmp0);
+ g = __lasx_xvpackev_b(tmp1, tmp1);
+ a = __lasx_xvpackod_b(tmp1, tmp1);
+ reg0 = __lasx_xvmulwev_w_hu(b, a);
+ reg1 = __lasx_xvmulwod_w_hu(b, a);
+ reg2 = __lasx_xvmulwev_w_hu(r, a);
+ reg3 = __lasx_xvmulwod_w_hu(r, a);
+ reg4 = __lasx_xvmulwev_w_hu(g, a);
+ reg5 = __lasx_xvmulwod_w_hu(g, a);
+ reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
+ reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
+ reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+ reg0 = __lasx_xvshuf_h(control, reg0, reg0);
+ reg2 = __lasx_xvshuf_h(control, reg2, reg2);
+ reg4 = __lasx_xvshuf_h(control, reg4, reg4);
+ tmp0 = __lasx_xvpackev_b(reg4, reg0);
+ tmp1 = __lasx_xvpackev_b(a, reg2);
+ dst0 = __lasx_xvilvl_h(tmp1, tmp0);
+ dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ src_argb += 64;
+ }
+}
+
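+// Add the replicated 4-byte dither pattern to each channel, clip to 255, then
+// truncate and pack into RGB565.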
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1, dst0;
+ __m256i b, g, r;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);
+
+ vec_dither = __lasx_xvilvl_b(zero, vec_dither);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ b = __lasx_xvpackev_b(zero, tmp0);
+ r = __lasx_xvpackod_b(zero, tmp0);
+ g = __lasx_xvpackev_b(zero, tmp1);
+ b = __lasx_xvadd_h(b, vec_dither);
+ g = __lasx_xvadd_h(g, vec_dither);
+ r = __lasx_xvadd_h(r, vec_dither);
+ DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);
+ r = __lasx_xvclip255_h(r);
+ b = __lasx_xvsrai_h(b, 3);
+ g = __lasx_xvsrai_h(g, 2);
+ r = __lasx_xvsrai_h(r, 3);
+ g = __lasx_xvslli_h(g, 5);
+ r = __lasx_xvslli_h(r, 11);
+ dst0 = __lasx_xvor_v(b, g);
+ dst0 = __lasx_xvor_v(dst0, r);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ src_argb += 64;
+ dst_rgb += 32;
+ }
+}
+
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, dst0, dst1;
+ __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000,
+ 0x0C0C0C0C08080808};
+ __m256i temp = __lasx_xvldrepl_w(shuffler, 0);
+
+ shuf = __lasx_xvadd_b(shuf, temp);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ dst0 = __lasx_xvshuf_b(src0, src0, shuf);
+ dst1 = __lasx_xvshuf_b(src1, src1, shuf);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ src_argb += 64;
+ dst_argb += 64;
+ }
+}
+
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ int x;
+ int len = width / 8;
+ __m256i src0, dst0, tmp0, tmp1;
+ __m256i vec_value = __lasx_xvreplgr2vr_w(value);
+
+ vec_value = __lasx_xvilvl_b(vec_value, vec_value);
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb, 0);
+ tmp0 = __lasx_xvilvl_b(src0, src0);
+ tmp1 = __lasx_xvilvh_b(src0, src0);
+ tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);
+ tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
+ dst0 = __lasx_xvpickod_b(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
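+// gray = (29 * B + 150 * G + 77 * R + 128) >> 8, written to B, G and R with
+// alpha preserved.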
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, reg2, dst0, dst1;
+ __m256i const_128 = __lasx_xvldi(0x480);
+ __m256i const_150 = __lasx_xvldi(0x96);
+ __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
+ 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);
+ reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);
+ reg2 = __lasx_xvadd_h(reg0, reg1);
+ tmp0 = __lasx_xvpackod_b(reg2, reg2);
+ tmp1 = __lasx_xvpackod_b(tmp1, reg2);
+ dst0 = __lasx_xvilvl_h(tmp1, tmp0);
+ dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ src_argb += 64;
+ dst_argb += 64;
+ }
+}
+
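+// In-place sepia filter:
+//   B' = (17 * B + 68 * G + 35 * R) >> 7
+//   G' = (22 * B + 88 * G + 45 * R) >> 7
+//   R' = (24 * B + 98 * G + 50 * R) >> 7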
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, spb, spg, spr;
+ __m256i dst0, dst1;
+ __m256i spb_g = __lasx_xvldi(68);
+ __m256i spg_g = __lasx_xvldi(88);
+ __m256i spr_g = __lasx_xvldi(98);
+ __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311,
+ 0x2311231123112311};
+ __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16,
+ 0x2D162D162D162D16};
+ __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218,
+ 0x3218321832183218};
+ __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100,
+ 0x1F0E1D0C1B0A1908};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+ spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
+ spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);
+ spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
+ spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
+ spb = __lasx_xvsrli_h(spb, 7);
+ spg = __lasx_xvsrli_h(spg, 7);
+ spr = __lasx_xvsrli_h(spr, 7);
+ spg = __lasx_xvsat_hu(spg, 7);
+ spr = __lasx_xvsat_hu(spr, 7);
+ reg0 = __lasx_xvpackev_b(spg, spb);
+ reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ }
+}
+
+void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb4444, 0);
+ src1 = __lasx_xvld(src_argb4444, 32);
+ DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2);
+ DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3);
+ DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2);
+ DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2,
+ 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_argb4444 += 64;
+ dst_argb += 128;
+ }
+}
+
+void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ tmpa = __lasx_xvsrli_b(tmp1, 7);
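+ // tmpa now holds 0 or 1 per pixel; negation maps this to 0x00 or 0xFF,
+ // replicating the 1-bit alpha across the whole byte.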
+ tmpa = __lasx_xvneg_b(tmpa);
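+ // Expand the 5-bit channels to 8 bits: (x << 3) | (x >> 2) copies the top
+ // bits into the low bits.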
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ dst2 = __lasx_xvilvl_h(reg3, reg2);
+ dst3 = __lasx_xvilvh_h(reg3, reg2);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_argb1555 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst2 = __lasx_xvilvl_h(reg1, reg0);
+ dst3 = __lasx_xvilvh_h(reg1, reg0);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_rgb565 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,
+ 0x100B0A0910080706};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_rgb24, 0);
+ reg1 = __lasx_xvld(src_rgb24, 32);
+ reg2 = __lasx_xvld(src_rgb24, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_rgb24 += 96;
+ dst_argb += 128;
+ }
+}
+
+void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,
+ 0x10090A0B10060708};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_raw, 0);
+ reg1 = __lasx_xvld(src_raw, 32);
+ reg2 = __lasx_xvld(src_raw, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_raw += 96;
+ dst_argb += 128;
+ }
+}
+
+void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_argb1555 += 64;
+ dst_y += 32;
+ }
+}
+
+void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
+ next_argb1555, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ nexg = __lasx_xvsrli_b(tmp2, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg2 = __lasx_xvandi_b(tmp3, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ reg2 = __lasx_xvslli_b(reg2, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ nexg = __lasx_xvor_v(nexg, reg2);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ reg3 = __lasx_xvandi_b(tmp3, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ nexr = __lasx_xvsrli_b(reg3, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvslli_b(nexb, 3);
+ reg1 = __lasx_xvslli_b(nexg, 3);
+ reg2 = __lasx_xvslli_b(nexr, 3);
+ nexb = __lasx_xvsrli_b(nexb, 2);
+ nexg = __lasx_xvsrli_b(nexg, 2);
+ nexr = __lasx_xvsrli_b(nexr, 2);
+ nexb = __lasx_xvor_v(reg0, nexb);
+ nexg = __lasx_xvor_v(reg1, nexg);
+ nexr = __lasx_xvor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_argb1555 += 64;
+ next_argb1555 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ dst_y += 32;
+ src_rgb565 += 64;
+ }
+}
+
+void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
+ next_rgb565, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ nexr = __lasx_xvandi_b(tmp3, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg3 = __lasx_xvandi_b(tmp3, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ reg2 = __lasx_xvsrli_b(tmp2, 5);
+ reg3 = __lasx_xvslli_b(reg3, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ reg2 = __lasx_xvslli_b(nexb, 3);
+ reg3 = __lasx_xvsrli_b(nexb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ nexb = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ reg2 = __lasx_xvslli_b(nexg, 2);
+ reg3 = __lasx_xvsrli_b(nexg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ reg2 = __lasx_xvsrli_b(nexr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ nexr = __lasx_xvor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ dst_u += 16;
+ dst_v += 16;
+ src_rgb565 += 64;
+ next_rgb565 += 64;
+ }
+}
+
+void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
+ int len = width / 32;
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
+ 0x15120F0C09060300, 0x00000000001E1B18};
+ __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
+ 0x0706050403020100, 0x1D1A1714110A0908};
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};
+ __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};
+ __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
+ next_rgb24, 0, reg0, reg1, reg2, tmp0);
+ DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_rgb24 += 96;
+ next_rgb24 += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RAWToUVRow_LASX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;
+ int len = width / 32;
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
+ 0x15120F0C09060300, 0x00000000001E1B18};
+ __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
+ 0x0706050403020100, 0x1D1A1714110A0908};
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};
+ __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};
+ __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
+ reg0, reg1, reg2, tmp0);
+ DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_raw += 96;
+ next_raw += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
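+// NV12 chroma is interleaved and biased by 0x80; it is re-centered with a
+// byte subtract and sign-extended to 16 bits before the matrix multiply.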
+void NV12ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+
+void NV12ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ out_b = __lasx_xvsrli_h(out_b, 3);
+ out_g = __lasx_xvsrli_h(out_g, 2);
+ out_r = __lasx_xvsrli_h(out_r, 3);
+ out_g = __lasx_xvslli_h(out_g, 5);
+ out_r = __lasx_xvslli_h(out_r, 11);
+ out_r = __lasx_xvor_v(out_r, out_g);
+ out_r = __lasx_xvor_v(out_r, out_b);
+ __lasx_xvst(out_r, dst_rgb565, 0);
+ src_y += 16;
+ src_uv += 16;
+ dst_rgb565 += 32;
+ }
+}
+
+void NV21ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_uv = __lasx_xvld(src_uv, 0);
+ vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
+ vec_uv = __lasx_vext2xv_h_b(vec_uv);
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
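+
+// Scalar reference for the Y rows below, per pixel with bytes p[0..2]:
+//   y = (kRGBToY[0] * p[0] + kRGBToY[1] * p[1] + kRGBToY[2] * p[2] + kAddY) >> 8
+// kAddY combines the +16 range offset (BT.601 only) and the +0.5 rounding
+// term in 8.8 fixed point.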
+
+// ARGB expects the first 3 values to contain RGB and the 4th value to be
+// ignored.
+static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
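+ // shuff: word order for xvperm.w, restoring linear pixel order after the
+ // per-128-bit-lane byte picks.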
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr20, %4, 0 \n\t" // load shuff
+ "1: \n\t"
+ "xvld $xr4, %0, 0 \n\t"
+ "xvld $xr5, %0, 32 \n\t"
+ "xvld $xr6, %0, 64 \n\t"
+ "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
+ // ARGB
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR
+ "xvpickev.b $xr10, $xr7, $xr6 \n\t"
+ "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA
+ "xvpickod.b $xr11, $xr7, $xr6 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G
+ "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t"
+ "addi.d %0, %0, 128 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvperm.w $xr11, $xr10, $xr20 \n\t"
+ "xvst $xr11, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_argb), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), "r"(shuff)
+ : "memory");
+}
+
+void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first value to be A (ignored), followed by 3 values
+// containing RGB.
+// Same code as ARGB, except the even/odd byte picks select different lanes
+// to account for the leading alpha byte.
+static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr20, %4, 0 \n\t" // load shuff
+ "1: \n\t"
+ "xvld $xr4, %0, 0 \n\t"
+ "xvld $xr5, %0, 32 \n\t"
+ "xvld $xr6, %0, 64 \n\t"
+ "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
+ // RGBA
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG
+ "xvpickev.b $xr10, $xr7, $xr6 \n\t"
+ "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR
+ "xvpickod.b $xr11, $xr7, $xr6 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G
+ "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t"
+ "addi.d %0, %0, 128 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvperm.w $xr11, $xr10, $xr20 \n\t"
+ "xvst $xr11, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), "r"(shuff)
+ : "memory");
+}
+
+void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
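+ // Shuffle controls that de-interleave the packed 3-byte pixels so the B/R
+ // and G samples land in the even/odd byte lanes expected by the widening
+ // multiply-accumulates below.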
+ int8_t shuff[128] = {
+ 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+ 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+ 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
+ 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
+ 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
+ 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
+ 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
+ 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr4, %4, 0 \n\t" // load shuff
+ "xvld $xr5, %4, 32 \n\t"
+ "xvld $xr6, %4, 64 \n\t"
+ "xvld $xr7, %4, 96 \n\t"
+ "1: \n\t"
+ "xvld $xr8, %0, 0 \n\t"
+ "xvld $xr9, %0, 32 \n\t"
+ "xvld $xr10, %0, 64 \n\t" // load 32 pixels of
+ // RGB
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "xvor.v $xr11, $xr9, $xr9 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0
+ "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1
+ "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
+ "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t"
+ "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t"
+ "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t"
+ "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G
+ "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t"
+ "addi.d %0, %0, 96 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvst $xr10, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), // %3
+ "r"(shuff) // %4
+ : "memory");
+}
+
+void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants);
+}
+
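+// Full-range (JPEG) chroma. As in ARGBToUVRow_LASX the 2x2 sums stay scaled
+// by 2, so halved coefficients are used: 63/42/21 (U) and 63/53/10 (V),
+// i.e. roughly half of 127/84/43 and 127/107/20, before the final >> 8.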
+void ARGBToUVJRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i nex0, nex1, nex2, nex3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, dst0;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_63 = __lasx_xvldi(0x43F);
+ __m256i const_42 = __lasx_xvldi(0x42A);
+ __m256i const_21 = __lasx_xvldi(0x415);
+ __m256i const_53 = __lasx_xvldi(0x435);
+ __m256i const_10 = __lasx_xvldi(0x40A);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
+ 0x1F1D0F0D1B190B09};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64,
+ next_argb, 96, nex0, nex1, nex2, nex3);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp2 = __lasx_xvpickev_b(src3, src2);
+ tmp3 = __lasx_xvpickod_b(src3, src2);
+ tmpr = __lasx_xvpickod_b(tmp2, tmp0);
+ tmpb = __lasx_xvpickev_b(tmp2, tmp0);
+ tmpg = __lasx_xvpickev_b(tmp3, tmp1);
+ tmp0 = __lasx_xvpickev_b(nex1, nex0);
+ tmp1 = __lasx_xvpickod_b(nex1, nex0);
+ tmp2 = __lasx_xvpickev_b(nex3, nex2);
+ tmp3 = __lasx_xvpickod_b(nex3, nex2);
+ nexr = __lasx_xvpickod_b(tmp2, tmp0);
+ nexb = __lasx_xvpickev_b(tmp2, tmp0);
+ nexg = __lasx_xvpickev_b(tmp3, tmp1);
+ tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb);
+ tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb);
+ tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
+ reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
+ reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
+ tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
+ tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
+ tmpr = __lasx_xvavgr_hu(reg0, reg1);
+ reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
+ reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
+ reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
+ reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
+ reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
+ reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ tmp0 = __lasx_xvpermi_d(dst0, 0x44);
+ tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
+ dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 2);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 1);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ dst_u += 16;
+ dst_v += 16;
+ src_argb += 128;
+ next_argb += 128;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
diff --git a/source/row_lsx.cc b/source/row_lsx.cc
new file mode 100644
index 00000000..fa088c9e
--- /dev/null
+++ b/source/row_lsx.cc
@@ -0,0 +1,2987 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
+ { \
+ ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+// Load data for 32 YUV422 pixels
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+ { \
+ __m128i temp0, temp1; \
+ \
+ DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+ temp1 = __lsx_vld(psrc_v, 0); \
+ temp0 = __lsx_vsub_b(temp0, const_80); \
+ temp1 = __lsx_vsub_b(temp1, const_80); \
+ temp0 = __lsx_vsllwil_h_b(temp0, 0); \
+ temp1 = __lsx_vsllwil_h_b(temp1, 0); \
+ uv_l = __lsx_vilvl_h(temp0, temp1); \
+ uv_h = __lsx_vilvh_h(temp0, temp1); \
+ }
+
+// Load data for 16 YUV422 pixels
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+ { \
+ __m128i temp0, temp1; \
+ \
+ out_y = __lsx_vld(psrc_y, 0); \
+ temp0 = __lsx_vldrepl_d(psrc_u, 0); \
+ temp1 = __lsx_vldrepl_d(psrc_v, 0); \
+ uv = __lsx_vilvl_b(temp0, temp1); \
+ uv = __lsx_vsub_b(uv, const_80); \
+ uv = __lsx_vsllwil_h_b(uv, 0); \
+ }
+
+// Convert 16 pixels of YUV420 to RGB.
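+// Y is scaled by yg in 16.16 fixed point and biased by yb; the U/V
+// contributions are in x.6 fixed point, so each term is shifted right by 6
+// and clipped to [0, 255].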
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+ g_h, r_l, r_h) \
+ { \
+ __m128i u_l, u_h, v_l, v_h; \
+ __m128i yl_ev, yl_od, yh_ev, yh_od; \
+ __m128i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lsx_vilvl_b(in_y, in_y); \
+ temp1 = __lsx_vilvh_b(in_y, in_y); \
+ yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \
+ yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \
+ yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \
+ DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
+ yl_ev, yl_od, yh_ev, yh_od); \
+ yl_ev = __lsx_vadd_w(yl_ev, yb); \
+ yl_od = __lsx_vadd_w(yl_od, yb); \
+ yh_ev = __lsx_vadd_w(yh_ev, yb); \
+ yh_od = __lsx_vadd_w(yh_od, yb); \
+ v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \
+ u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \
+ v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \
+ u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \
+ temp0 = __lsx_vadd_w(yl_ev, u_l); \
+ temp1 = __lsx_vadd_w(yl_od, u_l); \
+ temp2 = __lsx_vadd_w(yh_ev, u_h); \
+ temp3 = __lsx_vadd_w(yh_od, u_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ b_l = __lsx_vpackev_h(temp1, temp0); \
+ b_h = __lsx_vpackev_h(temp3, temp2); \
+ temp0 = __lsx_vadd_w(yl_ev, v_l); \
+ temp1 = __lsx_vadd_w(yl_od, v_l); \
+ temp2 = __lsx_vadd_w(yh_ev, v_h); \
+ temp3 = __lsx_vadd_w(yh_od, v_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ r_l = __lsx_vpackev_h(temp1, temp0); \
+ r_h = __lsx_vpackev_h(temp3, temp2); \
+ DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
+ temp0 = __lsx_vsub_w(yl_ev, u_l); \
+ temp1 = __lsx_vsub_w(yl_od, u_l); \
+ temp2 = __lsx_vsub_w(yh_ev, u_h); \
+ temp3 = __lsx_vsub_w(yh_od, u_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ g_l = __lsx_vpackev_h(temp1, temp0); \
+ g_h = __lsx_vpackev_h(temp3, temp2); \
+ }
+
+// Convert 8 pixels of YUV to RGB; in_vu holds raw interleaved chroma bytes
+// (the UV plane of NV12/NV21, or the uv output of READYUV422).
+#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m128i y_ev, y_od, u_l, v_l; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ tmp0 = __lsx_vilvl_b(in_y, in_y); \
+ y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_vu = __lsx_vilvl_b(zero, in_vu); \
+ in_vu = __lsx_vsub_h(in_vu, __lsx_vldi(0x480)); \
+ u_l = __lsx_vmulwev_w_h(in_vu, vrub); \
+ v_l = __lsx_vmulwod_w_h(in_vu, vrub); \
+ tmp0 = __lsx_vadd_w(y_ev, u_l); \
+ tmp1 = __lsx_vadd_w(y_od, u_l); \
+ tmp2 = __lsx_vadd_w(y_ev, v_l); \
+ tmp3 = __lsx_vadd_w(y_od, v_l); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \
+ tmp1 = __lsx_vsub_w(y_ev, tmp0); \
+ tmp2 = __lsx_vsub_w(y_od, tmp0); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ out_g = __lsx_vpackev_h(tmp2, tmp1); \
+ }
+
+// Convert 8 pixels of I444 to RGB.
+#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
+ out_r) \
+ { \
+ __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_u = __lsx_vsub_h(in_u, const_80); \
+ in_v = __lsx_vsub_h(in_v, const_80); \
+ u_ev = __lsx_vmulwev_w_h(in_u, ub); \
+ u_od = __lsx_vmulwod_w_h(in_u, ub); \
+ v_ev = __lsx_vmulwev_w_h(in_v, vr); \
+ v_od = __lsx_vmulwod_w_h(in_v, vr); \
+ tmp0 = __lsx_vadd_w(y_ev, u_ev); \
+ tmp1 = __lsx_vadd_w(y_od, u_od); \
+ tmp2 = __lsx_vadd_w(y_ev, v_ev); \
+ tmp3 = __lsx_vadd_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ u_ev = __lsx_vpackev_h(in_u, in_v); \
+ u_od = __lsx_vpackod_h(in_u, in_v); \
+ v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \
+ v_od = __lsx_vdp2_w_h(u_od, ugvg); \
+ tmp0 = __lsx_vsub_w(y_ev, v_ev); \
+ tmp1 = __lsx_vsub_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ out_g = __lsx_vpackev_h(tmp1, tmp0); \
+ }
+
+// Pack and Store 16 ARGB values.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+ { \
+ __m128i temp0, temp1, temp2, temp3; \
+ temp0 = __lsx_vpackev_b(g_l, b_l); \
+ temp1 = __lsx_vpackev_b(a_l, r_l); \
+ temp2 = __lsx_vpackev_b(g_h, b_h); \
+ temp3 = __lsx_vpackev_b(a_h, r_h); \
+ r_l = __lsx_vilvl_h(temp1, temp0); \
+ r_h = __lsx_vilvh_h(temp1, temp0); \
+ g_l = __lsx_vilvl_h(temp3, temp2); \
+ g_h = __lsx_vilvh_h(temp3, temp2); \
+ __lsx_vst(r_l, pdst_argb, 0); \
+ __lsx_vst(r_h, pdst_argb, 16); \
+ __lsx_vst(g_l, pdst_argb, 32); \
+ __lsx_vst(g_h, pdst_argb, 48); \
+ pdst_argb += 64; \
+ }
+
+// Pack and Store 8 ARGB values.
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m128i temp0, temp1; \
+ __m128i dst0, dst1; \
+ \
+ temp0 = __lsx_vpackev_b(in_g, in_b); \
+ temp1 = __lsx_vpackev_b(in_a, in_r); \
+ dst0 = __lsx_vilvl_h(temp1, temp0); \
+ dst1 = __lsx_vilvh_h(temp1, temp0); \
+ __lsx_vst(dst0, pdst_argb, 0); \
+ __lsx_vst(dst1, pdst_argb, 16); \
+ pdst_argb += 32; \
+ }
+
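+// Box-filter a 2x2 block of B/G/R (current row plus next row), then form
+// U = 0x8080 + 56*B - 37*G - 19*R and V = 0x8080 + 56*R - 47*G - 9*B.
+// The usual 112/74/38/94/18 coefficients appear halved because the 2x2
+// average computed here still carries a factor of two.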
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
+ __m128i _reg0, _reg1; \
+ _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
+ _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \
+ _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
+ }
+
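+// Mirror 32 bytes per iteration: vshuf_b with descending indices reverses
+// each 16-byte vector and the two stores are written in swapped order.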
+void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1;
+ __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607};
+ src += width - 32;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ __lsx_vst(src1, dst, 0);
+ __lsx_vst(src0, dst, 16);
+ dst += 32;
+ src -= 32;
+ }
+}
+
+void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src, dst;
+ __m128i shuffler = {0x0004000500060007, 0x0000000100020003};
+
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < len; x++) {
+ src = __lsx_vld(src_uv, 0);
+ dst = __lsx_vshuf_h(shuffler, src, src);
+ __lsx_vst(dst, dst_uv, 0);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
+void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1;
+ __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504};
+
+ src += (width * 4) - 32;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ __lsx_vst(src1, dst, 0);
+ __lsx_vst(src0, dst, 16);
+ dst += 32;
+ src -= 32;
+ }
+}
+
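+// YUY2 byte order is Y0 U Y1 V, so interleaving the packed UV pairs with Y
+// yields two 16-byte outputs per 16 luma samples.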
+void I422ToYUY2Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_u0, src_v0, src_y0, vec_uv0;
+ __m128i vec_yuy2_0, vec_yuy2_1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lsx_vld(src_y, 0);
+ vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
+ vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0);
+ vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0);
+ __lsx_vst(vec_yuy2_0, dst_yuy2, 0);
+ __lsx_vst(vec_yuy2_1, dst_yuy2, 16);
+ src_u += 8;
+ src_v += 8;
+ src_y += 16;
+ dst_yuy2 += 32;
+ }
+}
+
+void I422ToUYVYRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_u0, src_v0, src_y0, vec_uv0;
+ __m128i vec_uyvy0, vec_uyvy1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lsx_vld(src_y, 0);
+ vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
+ vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0);
+ vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0);
+ __lsx_vst(vec_uyvy0, dst_uyvy, 0);
+ __lsx_vst(vec_uyvy1, dst_uyvy, 16);
+ src_u += 8;
+ src_v += 8;
+ src_y += 16;
+ dst_uyvy += 32;
+ }
+}
+
+void I422ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void I422ToRGBARow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ int res = width & 15;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i zero = __lsx_vldi(0);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
+
+ y = __lsx_vld(src_a, 0);
+ a_l = __lsx_vilvl_b(zero, y);
+ a_h = __lsx_vilvh_b(zero, y);
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ src_a += 16;
+ }
+ if (res) {
+ __m128i y, uv, r, g, b, a;
+ a = __lsx_vld(src_a, 0);
+ a = __lsx_vsllwil_hu_bu(a, 0);
+ READYUV422(src_y, src_u, src_v, y, uv);
+ // uv is in V/U order while vec_ubvr pairs (vr, ub), so the even lanes
+ // produce R and the odd lanes produce B; swap the B/R outputs to match.
+ YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, r, g, b);
+ STOREARGB(a, r, g, b, dst_argb);
+ }
+}
+
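+// After conversion the B/G/R halfwords are packed to bytes and two shuffle
+// masks squeeze 16 BGR triples into three 16-byte stores (48 bytes).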
+void I422ToRGB24Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614};
+ __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m128i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ temp0 = __lsx_vpackev_b(g_l, b_l);
+ temp1 = __lsx_vpackev_b(g_h, b_h);
+ DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l,
+ temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
+ temp1);
+
+ b_l = __lsx_vilvl_d(temp1, temp2);
+ b_h = __lsx_vilvh_d(temp3, temp1);
+ __lsx_vst(temp0, dst_argb, 0);
+ __lsx_vst(b_l, dst_argb, 16);
+ __lsx_vst(b_h, dst_argb, 32);
+ dst_argb += 48;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
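+// RGB565 layout: bits 15..11 R, 10..5 G, 4..0 B; each channel is truncated
+// by a right shift before being merged into place.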
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 3);
+ b_h = __lsx_vsrli_h(b_h, 3);
+ g_l = __lsx_vsrli_h(g_l, 2);
+ g_h = __lsx_vsrli_h(g_h, 2);
+ r_l = __lsx_vsrli_h(r_l, 3);
+ r_h = __lsx_vsrli_h(r_h, 3);
+ r_l = __lsx_vslli_h(r_l, 11);
+ r_h = __lsx_vslli_h(r_h, 11);
+ g_l = __lsx_vslli_h(g_l, 5);
+ g_h = __lsx_vslli_h(g_h, 5);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_rgb565, 0);
+ __lsx_vst(r_h, dst_rgb565, 16);
+ dst_rgb565 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
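+// ARGB4444 layout: bits 15..12 A, 11..8 R, 7..4 G, 3..0 B; alpha is forced
+// to 0xF here.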
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000};
+ __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 4);
+ b_h = __lsx_vsrli_h(b_h, 4);
+ r_l = __lsx_vsrli_h(r_l, 4);
+ r_h = __lsx_vsrli_h(r_h, 4);
+ g_l = __lsx_vand_v(g_l, mask);
+ g_h = __lsx_vand_v(g_h, mask);
+ r_l = __lsx_vslli_h(r_l, 8);
+ r_h = __lsx_vslli_h(r_h, 8);
+ r_l = __lsx_vor_v(r_l, alpha);
+ r_h = __lsx_vor_v(r_h, alpha);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_argb4444, 0);
+ __lsx_vst(r_h, dst_argb4444, 16);
+ dst_argb4444 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
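+// ARGB1555 layout: bit 15 A (forced to 1), bits 14..10 R, 9..5 G, 4..0 B.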
+void I422ToARGB1555Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 3);
+ b_h = __lsx_vsrli_h(b_h, 3);
+ g_l = __lsx_vsrli_h(g_l, 3);
+ g_h = __lsx_vsrli_h(g_h, 3);
+ g_l = __lsx_vslli_h(g_l, 5);
+ g_h = __lsx_vslli_h(g_h, 5);
+ r_l = __lsx_vsrli_h(r_l, 3);
+ r_h = __lsx_vsrli_h(r_h, 3);
+ r_l = __lsx_vslli_h(r_l, 10);
+ r_h = __lsx_vslli_h(r_h, 10);
+ r_l = __lsx_vor_v(r_l, alpha);
+ r_h = __lsx_vor_v(r_h, alpha);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_argb1555, 0);
+ __lsx_vst(r_h, dst_argb1555, 16);
+ dst_argb1555 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
+ dst0 = __lsx_vpickev_b(src1, src0);
+ __lsx_vst(dst0, dst_y, 0);
+ src_yuy2 += 32;
+ dst_y += 16;
+ }
+}
+
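+// Chroma is shared by two rows in YUV420, so U/V from the current and next
+// YUY2 rows are averaged with rounding before being split into planes.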
+void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0,
+ src_yuy2_next, 16, src0, src1, src2, src3);
+ src0 = __lsx_vpickod_b(src1, src0);
+ src1 = __lsx_vpickod_b(src3, src2);
+ tmp0 = __lsx_vavgr_bu(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_yuy2 += 32;
+ src_yuy2_next += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_yuy2 += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
+ dst0 = __lsx_vpickod_b(src1, src0);
+ __lsx_vst(dst0, dst_y, 0);
+ src_uyvy += 32;
+ dst_y += 16;
+ }
+}
+
+void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0,
+ src_uyvy_next, 16, src0, src1, src2, src3);
+ src0 = __lsx_vpickev_b(src1, src0);
+ src1 = __lsx_vpickev_b(src3, src2);
+ tmp0 = __lsx_vavgr_bu(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_uyvy += 32;
+ src_uyvy_next += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_uyvy += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
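+// 2x2 subsample of ARGB to U/V. As in RGBTOUV, the box filter leaves a
+// factor of two in the averages, so the constants below hold half of the
+// nominal 0x70/0x4A/0x26/0x5E/0x12 coefficients their names refer to.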
+void ARGBToUVRow_LSX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
+ __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
+ __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
+ __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
+ __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
+ __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
+ 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1,
+ 48, src4, src5, src6, src7);
+ vec0 = __lsx_vaddwev_h_bu(src0, src4);
+ vec1 = __lsx_vaddwev_h_bu(src1, src5);
+ vec2 = __lsx_vaddwev_h_bu(src2, src6);
+ vec3 = __lsx_vaddwev_h_bu(src3, src7);
+ tmp0 = __lsx_vpickev_h(vec1, vec0);
+ tmp1 = __lsx_vpickev_h(vec3, vec2);
+ tmp2 = __lsx_vpickod_h(vec1, vec0);
+ tmp3 = __lsx_vpickod_h(vec3, vec2);
+ vec0 = __lsx_vaddwod_h_bu(src0, src4);
+ vec1 = __lsx_vaddwod_h_bu(src1, src5);
+ vec2 = __lsx_vaddwod_h_bu(src2, src6);
+ vec3 = __lsx_vaddwod_h_bu(src3, src7);
+ tmp4 = __lsx_vpickev_h(vec1, vec0);
+ tmp5 = __lsx_vpickev_h(vec3, vec2);
+ vec0 = __lsx_vpickev_h(tmp1, tmp0);
+ vec1 = __lsx_vpickod_h(tmp1, tmp0);
+ src0 = __lsx_vavgr_h(vec0, vec1);
+ vec0 = __lsx_vpickev_h(tmp3, tmp2);
+ vec1 = __lsx_vpickod_h(tmp3, tmp2);
+ src1 = __lsx_vavgr_h(vec0, vec1);
+ vec0 = __lsx_vpickev_h(tmp5, tmp4);
+ vec1 = __lsx_vpickod_h(tmp5, tmp4);
+ src2 = __lsx_vavgr_h(vec0, vec1);
+ dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
+ dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
+ dst0 = __lsx_vsrai_h(dst0, 8);
+ dst1 = __lsx_vsrai_h(dst1, 8);
+ dst0 = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ src_argb0 += 64;
+ src_argb1 += 64;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 16) - 1;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ __lsx_vst(tmp3, dst_rgb, 36);
+ dst_rgb += 48;
+ src_argb += 64;
+ }
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ dst_rgb += 36;
+ __lsx_vst(tmp3, dst_rgb, 0);
+}
+
+void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 16) - 1;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ __lsx_vst(tmp3, dst_rgb, 36);
+ dst_rgb += 48;
+ src_argb += 64;
+ }
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ dst_rgb += 36;
+ __lsx_vst(tmp3, dst_rgb, 0);
+}
+
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, tmp0, tmp1, dst0;
+ __m128i shift = {0x0300030003000300, 0x0300030003000300};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp0 = __lsx_vsrli_b(tmp0, 3);
+ tmp1 = __lsx_vpackev_b(zero, tmp1);
+ tmp1 = __lsx_vsrli_h(tmp1, 2);
+ tmp0 = __lsx_vsll_b(tmp0, shift);
+ tmp1 = __lsx_vslli_h(tmp1, 5);
+ dst0 = __lsx_vor_v(tmp0, tmp1);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i shift1 = {0x0703070307030703, 0x0703070307030703};
+ __m128i shift2 = {0x0200020002000200, 0x0200020002000200};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp0 = __lsx_vsrli_b(tmp0, 3);
+ tmp1 = __lsx_vsrl_b(tmp1, shift1);
+ tmp0 = __lsx_vsll_b(tmp0, shift2);
+ tmp2 = __lsx_vpackev_b(zero, tmp1);
+ tmp3 = __lsx_vpackod_b(zero, tmp1);
+ tmp2 = __lsx_vslli_h(tmp2, 5);
+ tmp3 = __lsx_vslli_h(tmp3, 15);
+ dst0 = __lsx_vor_v(tmp0, tmp2);
+ dst0 = __lsx_vor_v(dst0, tmp3);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vandi_b(tmp1, 0xF0);
+ tmp0 = __lsx_vsrli_b(tmp0, 4);
+ dst0 = __lsx_vor_v(tmp1, tmp0);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToUV444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, dst0, dst1;
+ __m128i const_112 = __lsx_vldi(112);
+ __m128i const_74 = __lsx_vldi(74);
+ __m128i const_38 = __lsx_vldi(38);
+ __m128i const_94 = __lsx_vldi(94);
+ __m128i const_18 = __lsx_vldi(18);
+ __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vpickev_h(src1, src0);
+ tmp1 = __lsx_vpickod_h(src1, src0);
+ tmp2 = __lsx_vpickev_h(src3, src2);
+ tmp3 = __lsx_vpickod_h(src3, src2);
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
+ reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
+ reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
+ reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38);
+ reg0 = __lsx_vsub_h(reg0, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg3);
+ reg0 = __lsx_vsrai_h(reg0, 8);
+ reg1 = __lsx_vsrai_h(reg1, 8);
+ dst0 = __lsx_vpickev_b(reg1, reg0);
+
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
+ reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
+ reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
+ reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94);
+ reg0 = __lsx_vsub_h(reg0, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg3);
+ reg0 = __lsx_vsrai_h(reg0, 8);
+ reg1 = __lsx_vsrai_h(reg1, 8);
+ dst1 = __lsx_vpickev_b(reg1, reg0);
+
+ __lsx_vst(dst0, dst_u, 0);
+ __lsx_vst(dst1, dst_v, 0);
+ dst_u += 16;
+ dst_v += 16;
+ src_argb += 64;
+ }
+}
+
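+// Multiply: src0 is byte-duplicated (value * 257) and multiplied by the
+// zero-extended src1 channel; the high 16 bits of the product closely
+// approximate (a * b) / 255.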
+void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, dst0, dst1;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ tmp0 = __lsx_vilvl_b(src0, src0);
+ tmp1 = __lsx_vilvh_b(src0, src0);
+ tmp2 = __lsx_vilvl_b(zero, src1);
+ tmp3 = __lsx_vilvh_b(zero, src1);
+ dst0 = __lsx_vmuh_hu(tmp0, tmp2);
+ dst1 = __lsx_vmuh_hu(tmp1, tmp3);
+ dst0 = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBAddRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lsx_vsadd_bu(src0, src1);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lsx_vssub_bu(src0, src1);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
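+// Attenuate: each channel and its alpha are duplicated into halfwords
+// (value * 257), multiplied into 32 bits and narrowed by 24 bits, a
+// fixed-point approximation of value * alpha / 255.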
+void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i b, g, r, a, dst0, dst1;
+ __m128i control = {0x0005000100040000, 0x0007000300060002};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ b = __lsx_vpackev_b(tmp0, tmp0);
+ r = __lsx_vpackod_b(tmp0, tmp0);
+ g = __lsx_vpackev_b(tmp1, tmp1);
+ a = __lsx_vpackod_b(tmp1, tmp1);
+ reg0 = __lsx_vmulwev_w_hu(b, a);
+ reg1 = __lsx_vmulwod_w_hu(b, a);
+ reg2 = __lsx_vmulwev_w_hu(r, a);
+ reg3 = __lsx_vmulwod_w_hu(r, a);
+ reg4 = __lsx_vmulwev_w_hu(g, a);
+ reg5 = __lsx_vmulwod_w_hu(g, a);
+ reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
+ reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
+ reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
+ reg0 = __lsx_vshuf_h(control, reg0, reg0);
+ reg2 = __lsx_vshuf_h(control, reg2, reg2);
+ reg4 = __lsx_vshuf_h(control, reg4, reg4);
+ tmp0 = __lsx_vpackev_b(reg4, reg0);
+ tmp1 = __lsx_vpackev_b(a, reg2);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ src_argb += 32;
+ }
+}
+
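+// The 4-byte dither pattern is widened to halfwords and added to each
+// channel with clipping before the RGB565 truncation.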
+void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+ __m128i b, g, r;
+ __m128i zero = __lsx_vldi(0);
+ __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0);
+
+ vec_dither = __lsx_vilvl_b(zero, vec_dither);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ b = __lsx_vpackev_b(zero, tmp0);
+ r = __lsx_vpackod_b(zero, tmp0);
+ g = __lsx_vpackev_b(zero, tmp1);
+ b = __lsx_vadd_h(b, vec_dither);
+ g = __lsx_vadd_h(g, vec_dither);
+ r = __lsx_vadd_h(r, vec_dither);
+ DUP2_ARG1(__lsx_vclip255_h, b, g, b, g);
+ r = __lsx_vclip255_h(r);
+ b = __lsx_vsrai_h(b, 3);
+ g = __lsx_vsrai_h(g, 2);
+ r = __lsx_vsrai_h(r, 3);
+ g = __lsx_vslli_h(g, 5);
+ r = __lsx_vslli_h(r, 11);
+ dst0 = __lsx_vor_v(b, g);
+ dst0 = __lsx_vor_v(dst0, r);
+ __lsx_vst(dst0, dst_rgb, 0);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBShuffleRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, dst0, dst1;
+ __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808};
+ __m128i temp = __lsx_vldrepl_w(shuffler, 0);
+
+ shuf = __lsx_vadd_b(shuf, temp);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ dst0 = __lsx_vshuf_b(src0, src0, shuf);
+ dst1 = __lsx_vshuf_b(src1, src1, shuf);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBShadeRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ int x;
+ int len = width / 4;
+ __m128i src0, dst0, tmp0, tmp1;
+ __m128i vec_value = __lsx_vreplgr2vr_w(value);
+
+ vec_value = __lsx_vilvl_b(vec_value, vec_value);
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb, 0);
+ tmp0 = __lsx_vilvl_b(src0, src0);
+ tmp1 = __lsx_vilvh_b(src0, src0);
+ tmp0 = __lsx_vmuh_hu(tmp0, vec_value);
+ tmp1 = __lsx_vmuh_hu(tmp1, vec_value);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 16;
+ dst_argb += 16;
+ }
+}
+
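+// Gray = (77*R + 150*G + 29*B + 128) >> 8, written to B, G and R with the
+// original alpha preserved.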
+void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, reg2, dst0, dst1;
+ __m128i const_128 = __lsx_vldi(0x480);
+ __m128i const_150 = __lsx_vldi(0x96);
+ __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ reg0 = __lsx_vdp2_h_bu(tmp0, const_br);
+ reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150);
+ reg2 = __lsx_vadd_h(reg0, reg1);
+ tmp0 = __lsx_vpackod_b(reg2, reg2);
+ tmp1 = __lsx_vpackod_b(tmp1, reg2);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
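+// In-place sepia: b = (17*B + 68*G + 35*R) >> 7, g = (22*B + 88*G + 45*R)
+// >> 7, r = (24*B + 98*G + 50*R) >> 7; g and r are saturated to 255 and
+// alpha is untouched.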
+void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, spb, spg, spr;
+ __m128i dst0, dst1;
+ __m128i spb_g = __lsx_vldi(68);
+ __m128i spg_g = __lsx_vldi(88);
+ __m128i spr_g = __lsx_vldi(98);
+ __m128i spb_br = {0x2311231123112311, 0x2311231123112311};
+ __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16};
+ __m128i spr_br = {0x3218321832183218, 0x3218321832183218};
+ __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+ spr = __lsx_vdp2_h_bu(tmp0, spr_br);
+ spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g);
+ spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g);
+ spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g);
+ spb = __lsx_vsrli_h(spb, 7);
+ spg = __lsx_vsrli_h(spg, 7);
+ spr = __lsx_vsrli_h(spr, 7);
+ spg = __lsx_vsat_hu(spg, 7);
+ spr = __lsx_vsat_hu(spr, 7);
+ reg0 = __lsx_vpackev_b(spg, spb);
+ reg1 = __lsx_vshuf_b(tmp1, spr, shuff);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
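+// Each 4-bit channel expands to 8 bits by replicating its nibble.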
+void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb4444, 0);
+ src1 = __lsx_vld(src_argb4444, 16);
+ tmp0 = __lsx_vandi_b(src0, 0x0F);
+ tmp1 = __lsx_vandi_b(src0, 0xF0);
+ tmp2 = __lsx_vandi_b(src1, 0x0F);
+ tmp3 = __lsx_vandi_b(src1, 0xF0);
+ reg0 = __lsx_vslli_b(tmp0, 4);
+ reg2 = __lsx_vslli_b(tmp2, 4);
+ reg1 = __lsx_vsrli_b(tmp1, 4);
+ reg3 = __lsx_vsrli_b(tmp3, 4);
+ DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0,
+ tmp1, tmp2, tmp3);
+ dst0 = __lsx_vilvl_b(tmp1, tmp0);
+ dst2 = __lsx_vilvl_b(tmp3, tmp2);
+ dst1 = __lsx_vilvh_b(tmp1, tmp0);
+ dst3 = __lsx_vilvh_b(tmp3, tmp2);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_argb4444 += 32;
+ }
+}
+
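+// 5-bit channels expand as (x << 3) | (x >> 2); the alpha bit becomes 0x00
+// or 0xFF by negating the extracted 0/1 byte.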
+void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
+ __m128i reg0, reg1, reg2;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb1555, 0);
+ src1 = __lsx_vld(src_argb1555, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ tmpa = __lsx_vsrli_b(tmp1, 7);
+ tmpa = __lsx_vneg_b(tmpa);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ dst2 = __lsx_vilvl_h(reg1, reg0);
+ dst3 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_argb1555 += 32;
+ }
+}
+
+void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb565, 0);
+ src1 = __lsx_vld(src_rgb565, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst2 = __lsx_vilvl_h(reg1, reg0);
+ dst3 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_rgb565 += 32;
+ }
+}
+
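+// 48 RGB24 bytes are regrouped into four 12-byte runs, then a shuffle
+// against the alpha vector inserts 0xFF after every B,G,R triple.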
+void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i tmp0, tmp1, tmp2;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+ __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+ __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+ __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb24, 0);
+ src1 = __lsx_vld(src_rgb24, 16);
+ src2 = __lsx_vld(src_rgb24, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
+ tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_rgb24 += 48;
+ }
+}
+
+void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i tmp0, tmp1, tmp2;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+ __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+ __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+ __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_raw, 0);
+ src1 = __lsx_vld(src_raw, 16);
+ src2 = __lsx_vld(src_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
+ tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_raw += 48;
+ }
+}
+
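+// Y = (66*R + 129*G + 25*B + 0x1080) >> 8; the 0x1080 bias folds in the
+// +16 luma offset and 0.5 for rounding.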
+void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, reg2, dst0;
+ __m128i const_66 = __lsx_vldi(66);
+ __m128i const_129 = __lsx_vldi(129);
+ __m128i const_25 = __lsx_vldi(25);
+ __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb1555, 0);
+ src1 = __lsx_vld(src_argb1555, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lsx_vpackod_b(reg1, reg0);
+ __lsx_vst(dst0, dst_y, 0);
+ dst_y += 16;
+ src_argb1555 += 32;
+ }
+}
+
+void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
+ next_argb1555, 16, src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ nexg = __lsx_vsrli_b(tmp2, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg2 = __lsx_vandi_b(tmp3, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ reg2 = __lsx_vslli_b(reg2, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ nexg = __lsx_vor_v(nexg, reg2);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ reg3 = __lsx_vandi_b(tmp3, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ nexr = __lsx_vsrli_b(reg3, 2);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ reg0 = __lsx_vslli_b(nexb, 3);
+ reg1 = __lsx_vslli_b(nexg, 3);
+ reg2 = __lsx_vslli_b(nexr, 3);
+ nexb = __lsx_vsrli_b(nexb, 2);
+ nexg = __lsx_vsrli_b(nexg, 2);
+ nexr = __lsx_vsrli_b(nexr, 2);
+ nexb = __lsx_vor_v(reg0, nexb);
+ nexg = __lsx_vor_v(reg1, nexg);
+ nexr = __lsx_vor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_argb1555 += 32;
+ next_argb1555 += 32;
+ }
+}
+
+void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, dst0;
+ __m128i const_66 = __lsx_vldi(66);
+ __m128i const_129 = __lsx_vldi(129);
+ __m128i const_25 = __lsx_vldi(25);
+ __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb565, 0);
+ src1 = __lsx_vld(src_rgb565, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lsx_vpackod_b(reg1, reg0);
+ __lsx_vst(dst0, dst_y, 0);
+ dst_y += 16;
+ src_rgb565 += 32;
+ }
+}
+
+void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
+ next_rgb565, 16, src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ nexr = __lsx_vandi_b(tmp3, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg3 = __lsx_vandi_b(tmp3, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ reg2 = __lsx_vsrli_b(tmp2, 5);
+ reg3 = __lsx_vslli_b(reg3, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ reg2 = __lsx_vslli_b(nexb, 3);
+ reg3 = __lsx_vsrli_b(nexb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ nexb = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ reg2 = __lsx_vslli_b(nexg, 2);
+ reg3 = __lsx_vsrli_b(nexg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ reg2 = __lsx_vsrli_b(nexr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ nexr = __lsx_vor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb565 += 32;
+ next_rgb565 += 32;
+ }
+}
+
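+// Six shuffle masks gather the B, G and R bytes of 16 RGB24 pixels (spread
+// over three vectors per row) into per-channel vectors for RGBTOUV.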
+void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb24, 0);
+ src1 = __lsx_vld(src_rgb24, 16);
+ src2 = __lsx_vld(src_rgb24, 32);
+ nex0 = __lsx_vld(next_rgb24, 0);
+ nex1 = __lsx_vld(next_rgb24, 16);
+ nex2 = __lsx_vld(next_rgb24, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb24 += 48;
+ next_rgb24 += 48;
+ }
+}
+
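+// Same as RGB24ToUVRow above, but with the B and R gather masks swapped to
+// account for the RAW (R,G,B byte order) layout.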
+void RAWToUVRow_LSX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_raw, 0);
+ src1 = __lsx_vld(src_raw, 16);
+ src2 = __lsx_vld(src_raw, 32);
+ nex0 = __lsx_vld(next_raw, 0);
+ nex1 = __lsx_vld(next_raw, 16);
+ nex2 = __lsx_vld(next_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_raw += 48;
+ next_raw += 48;
+ }
+}
+
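+// NV12 chroma is a single interleaved U/V plane; YUVTORGB consumes the raw
+// bytes directly, so only the loads differ from the I422 path.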
+void NV12ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_uv += 8;
+ }
+}
+
+void NV12ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ out_b = __lsx_vsrli_h(out_b, 3);
+ out_g = __lsx_vsrli_h(out_g, 2);
+ out_r = __lsx_vsrli_h(out_r, 3);
+ out_g = __lsx_vslli_h(out_g, 5);
+ out_r = __lsx_vslli_h(out_r, 11);
+ out_r = __lsx_vor_v(out_r, out_g);
+ out_r = __lsx_vor_v(out_r, out_b);
+ __lsx_vst(out_r, dst_rgb565, 0);
+ src_y += 8;
+ src_uv += 8;
+ dst_rgb565 += 16;
+ }
+}
+
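+// NV21 stores chroma V-first; this is handled by swapping both the constant
+// pairing (ubvr instead of vrub) and the B/R outputs of YUVTORGB.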
+void NV21ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_uv;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i out_b, out_g, out_r;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_uv = __lsx_vld(src_vu, 0);
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_vu += 8;
+ }
+}
+
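+// dst = saturating sum of the two gradients, replicated to B, G and R with
+// alpha forced to 0xFF.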
+void SobelRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0;
+ __m128i out0, out1, out2, out3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
+ __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
+ __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
+ __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_sobelx, 0);
+ src1 = __lsx_vld(src_sobely, 0);
+ tmp0 = __lsx_vsadd_bu(src0, src1);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
+ tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
+ __lsx_vst(out0, dst_argb, 0);
+ __lsx_vst(out1, dst_argb, 16);
+ __lsx_vst(out2, dst_argb, 32);
+ __lsx_vst(out3, dst_argb, 48);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3);
+ dst0 = __lsx_vsadd_bu(src0, src2);
+ dst1 = __lsx_vsadd_bu(src1, src3);
+ __lsx_vst(dst0, dst_y, 0);
+ __lsx_vst(dst1, dst_y, 16);
+ src_sobelx += 32;
+ src_sobely += 32;
+ dst_y += 32;
+ }
+}
+
+void SobelXYRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_r, src_b, src_g;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src_r = __lsx_vld(src_sobelx, 0);
+ src_b = __lsx_vld(src_sobely, 0);
+ src_g = __lsx_vsadd_bu(src_r, src_b);
+ tmp0 = __lsx_vilvl_b(src_g, src_b);
+ tmp1 = __lsx_vilvh_b(src_g, src_b);
+ tmp2 = __lsx_vilvl_b(alpha, src_r);
+ tmp3 = __lsx_vilvh_b(alpha, src_r);
+ dst0 = __lsx_vilvl_h(tmp2, tmp0);
+ dst1 = __lsx_vilvh_h(tmp2, tmp0);
+ dst2 = __lsx_vilvl_h(tmp3, tmp1);
+ dst3 = __lsx_vilvh_h(tmp3, tmp1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
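+
+// SobelXYRow packs both gradients into one ARGB pixel per input byte. A
+// scalar sketch of the interleave result (illustration only):
+//   dst_argb[4 * i + 0] = src_sobely[i];                           // B
+//   dst_argb[4 * i + 1] = clamp255(src_sobelx[i] + src_sobely[i]); // G
+//   dst_argb[4 * i + 2] = src_sobelx[i];                           // R
+//   dst_argb[4 * i + 3] = 255;                                     // A
+// where clamp255 stands for the saturating add done by __lsx_vsadd_bu.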
+
+void BGRAToUVRow_LSX(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_bgra = src_bgra + src_stride_bgra;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_bgra += 64;
+ next_bgra += 64;
+ }
+}
+
+void ABGRToUVRow_LSX(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_abgr = src_abgr + src_stride_abgr;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_abgr += 64;
+ next_abgr += 64;
+ }
+}
+
+void RGBAToUVRow_LSX(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgba = src_rgba + src_stride_rgba;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgba += 64;
+ next_rgba += 64;
+ }
+}
+
+void ARGBToUVJRow_LSX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_63 = __lsx_vldi(0x43F);
+ __m128i const_42 = __lsx_vldi(0x42A);
+ __m128i const_21 = __lsx_vldi(0x415);
+ __m128i const_53 = __lsx_vldi(0x435);
+ __m128i const_10 = __lsx_vldi(0x40A);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
+ tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
+ tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
+ reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
+ reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
+ tmpb = __lsx_vavgr_hu(tmp0, tmp1);
+ tmpg = __lsx_vavgr_hu(tmp2, tmp3);
+ tmpr = __lsx_vavgr_hu(reg0, reg1);
+ reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
+ reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
+ reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
+ reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
+ reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
+ reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
+ dst0 = __lsx_vpickod_b(reg1, reg0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_argb += 64;
+ next_argb += 64;
+ }
+}
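+
+// ARGBToUVJRow_LSX is the JPEG (full-range) chroma transform on a 2x2 box
+// filter. After the vavgr_hu steps, tmpb/tmpg/tmpr hold twice the 2x2
+// average (four samples summed, then halved with rounding), and each output
+// byte is the high byte of (a worked form of the vmadd/vmsub chain):
+//   U = 0x8080 + 63 * b - 42 * g - 21 * r;
+//   V = 0x8080 + 63 * r - 53 * g - 10 * b;
+// so the final __lsx_vpickod_b performs the implicit >> 8.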
+
+void I444ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
+ __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_u = __lsx_vld(src_u, 0);
+ vec_v = __lsx_vld(src_v, 0);
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
+ vec_ul = __lsx_vilvl_b(zero, vec_u);
+ vec_vl = __lsx_vilvl_b(zero, vec_v);
+ I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ vec_uh = __lsx_vilvh_b(zero, vec_u);
+ vec_vh = __lsx_vilvh_b(zero, vec_v);
+ I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I400ToARGBRow_LSX(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_yl, vec_yh, out0;
+ __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
+ __m128i temp0, temp1;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
+ __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst0 = __lsx_vilvl_h(temp1, temp0);
+ dst1 = __lsx_vilvh_h(temp1, temp0);
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst2 = __lsx_vilvl_h(temp1, temp0);
+ dst3 = __lsx_vilvh_h(temp1, temp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_y += 16;
+ }
+}
+
+void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ tmp0 = __lsx_vilvl_b(vec_y, vec_y);
+ tmp1 = __lsx_vilvh_b(vec_y, vec_y);
+ tmp2 = __lsx_vilvl_b(alpha, vec_y);
+ tmp3 = __lsx_vilvh_b(alpha, vec_y);
+ dst0 = __lsx_vilvl_h(tmp2, tmp0);
+ dst1 = __lsx_vilvh_h(tmp2, tmp0);
+ dst2 = __lsx_vilvl_h(tmp3, tmp1);
+ dst3 = __lsx_vilvh_h(tmp3, tmp1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_y += 16;
+ }
+}
+
+void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_yuy2, 0);
+ vec_y = __lsx_vpickev_b(src0, src0);
+ vec_vu = __lsx_vpickod_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_yuy2 += 16;
+ }
+}
+
+void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_uyvy, 0);
+ vec_y = __lsx_vpickod_b(src0, src0);
+ vec_vu = __lsx_vpickev_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_uyvy += 16;
+ }
+}
+
+void InterpolateRow_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int32_t source_y_fraction) {
+ int x;
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* nex_ptr = src_ptr + src_stride;
+ uint16_t y_fractions;
+ int len = width / 32;
+ __m128i src0, src1, nex0, nex1;
+ __m128i dst0, dst1, y_frac;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i const_128 = __lsx_vldi(0x480);
+
+ if (y1_fraction == 0) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ __lsx_vst(src0, dst_ptr, 0);
+ __lsx_vst(src1, dst_ptr, 16);
+ src_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ if (y1_fraction == 128) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
+ dst0 = __lsx_vavgr_bu(src0, nex0);
+ dst1 = __lsx_vavgr_bu(src1, nex1);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
+ y_frac = __lsx_vreplgr2vr_h(y_fractions);
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
+ tmp0 = __lsx_vilvl_b(nex0, src0);
+ tmp1 = __lsx_vilvh_b(nex0, src0);
+ tmp2 = __lsx_vilvl_b(nex1, src1);
+ tmp3 = __lsx_vilvh_b(nex1, src1);
+ tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
+ tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
+ tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
+ tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
+ dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
+ dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+}
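+
+// The dot-product form above is the standard fixed-point lerp; per byte it
+// computes (a scalar sketch, with f == source_y_fraction):
+//   dst = (uint8_t)((src * (256 - f) + next * f + 128) >> 8);
+// The interleaves pair each src byte with the byte one row below, so a
+// single __lsx_vdp2add_h_bu forms src * y0 + next * y1 + 128 in every lane.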
+
+void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
+ int x;
+ int len = width / 4;
+ __m128i dst0 = __lsx_vreplgr2vr_w(v32);
+
+ for (x = 0; x < len; x++) {
+ __lsx_vst(dst0, dst_argb, 0);
+ dst_argb += 16;
+ }
+}
+
+void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i dst0, dst1, dst2;
+ __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
+ __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
+ __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
+ src2 = __lsx_vld(src_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
+ dst2 = __lsx_vshuf_b(src1, src2, shuf2);
+ dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
+ __lsx_vst(dst0, dst_rgb24, 0);
+ __lsx_vst(dst1, dst_rgb24, 16);
+ __lsx_vst(dst2, dst_rgb24, 32);
+ dst_rgb24 += 48;
+ src_raw += 48;
+ }
+}
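+
+// RAW (r,g,b byte order) to RGB24 (b,g,r byte order) is a pure byte swap;
+// per 3-byte pixel the scalar equivalent is:
+//   dst_rgb24[0] = src_raw[2];
+//   dst_rgb24[1] = src_raw[1];
+//   dst_rgb24[2] = src_raw[0];
+// The __lsx_vinsgr2vr_b patches output byte 30 (src_raw[32]), the one byte
+// the two-register shuffles cannot reach.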
+
+void MergeUVRow_LSX(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1);
+ dst0 = __lsx_vilvl_b(src1, src0);
+ dst1 = __lsx_vilvh_b(src1, src0);
+ __lsx_vst(dst0, dst_uv, 0);
+ __lsx_vst(dst1, dst_uv, 16);
+ src_u += 16;
+ src_v += 16;
+ dst_uv += 32;
+ }
+}
+
+void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src3, src2);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_a, 0);
+ src_argb += 64;
+ dst_a += 16;
+ }
+}
+
+void ARGBBlendRow_LSX(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, dst0, dst1;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i a0, a1, a2, a3;
+ __m128i const_256 = __lsx_vldi(0x500);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf4i_b(src0, 0xFF);
+ tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
+ a0 = __lsx_vilvl_b(zero, tmp0);
+ a1 = __lsx_vilvh_b(zero, tmp0);
+ a2 = __lsx_vilvl_b(zero, tmp1);
+ a3 = __lsx_vilvh_b(zero, tmp1);
+ reg0 = __lsx_vilvl_b(zero, src2);
+ reg1 = __lsx_vilvh_b(zero, src2);
+ reg2 = __lsx_vilvl_b(zero, src3);
+ reg3 = __lsx_vilvh_b(zero, src3);
+ DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
+ const_256, a3, a0, a1, a2, a3);
+ DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
+ reg2, reg3);
+ DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);
+ dst0 = __lsx_vsadd_bu(dst0, src0);
+ dst1 = __lsx_vsadd_bu(dst1, src1);
+ dst0 = __lsx_vbitsel_v(dst0, alpha, control);
+ dst1 = __lsx_vbitsel_v(dst1, alpha, control);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
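+
+// ARGBBlendRow uses the usual 8-bit "over" approximation; per color channel
+// it computes (a scalar sketch, with fg/bg as the src_argb/src_argb1 bytes):
+//   dst = min(255, fg + ((256 - fg_alpha) * bg >> 8));
+// and the __lsx_vbitsel_v with the 0xFF000000 mask then forces the
+// destination alpha to 255.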
+
+void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);
+ __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);
+ __m128i vec_scale = __lsx_vreplgr2vr_w(scale);
+ __m128i zero = __lsx_vldi(0);
+ __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
+ src0, src1, src2, src3);
+ reg0 = __lsx_vilvl_b(zero, src0);
+ reg1 = __lsx_vilvh_b(zero, src0);
+ reg2 = __lsx_vilvl_b(zero, src1);
+ reg3 = __lsx_vilvh_b(zero, src1);
+ reg4 = __lsx_vilvl_b(zero, src2);
+ reg5 = __lsx_vilvh_b(zero, src2);
+ reg6 = __lsx_vilvl_b(zero, src3);
+ reg7 = __lsx_vilvh_b(zero, src3);
+ tmp0 = __lsx_vilvl_h(zero, reg0);
+ tmp1 = __lsx_vilvh_h(zero, reg0);
+ tmp2 = __lsx_vilvl_h(zero, reg1);
+ tmp3 = __lsx_vilvh_h(zero, reg1);
+ tmp4 = __lsx_vilvl_h(zero, reg2);
+ tmp5 = __lsx_vilvh_h(zero, reg2);
+ tmp6 = __lsx_vilvl_h(zero, reg3);
+ tmp7 = __lsx_vilvh_h(zero, reg3);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
+ dst0 = __lsx_vpickev_b(reg1, reg0);
+ dst1 = __lsx_vpickev_b(reg3, reg2);
+ tmp0 = __lsx_vilvl_h(zero, reg4);
+ tmp1 = __lsx_vilvh_h(zero, reg4);
+ tmp2 = __lsx_vilvl_h(zero, reg5);
+ tmp3 = __lsx_vilvh_h(zero, reg5);
+ tmp4 = __lsx_vilvl_h(zero, reg6);
+ tmp5 = __lsx_vilvh_h(zero, reg6);
+ tmp6 = __lsx_vilvl_h(zero, reg7);
+ tmp7 = __lsx_vilvh_h(zero, reg7);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
+ dst2 = __lsx_vpickev_b(reg1, reg0);
+ dst3 = __lsx_vpickev_b(reg3, reg2);
+ DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
+ dst3, vec_size, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
+ vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
+ src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ }
+}
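+
+// ARGBQuantizeRow applies the posterize step to B, G and R in place; per
+// channel the scalar form is:
+//   v = (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
+// The closing __lsx_vbitsel_v with the 0xFF000000 mask writes the original
+// alpha byte back unchanged.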
+
+void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0, dst1;
+ __m128i tmp_b, tmp_g, tmp_r, tmp_a;
+ __m128i reg_b, reg_g, reg_r, reg_a;
+ __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);
+ __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);
+ __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);
+ __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
+ src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
+ src1, matrix_a, reg_b, reg_g, reg_r, reg_a);
+ DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
+ reg_a, reg_b, reg_g, reg_r, reg_a);
+ DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
+ tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
+ reg_g, reg_r, reg_a);
+ DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
+ tmp_a);
+ DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
+ reg_a);
+ DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);
+ tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
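+
+// ARGBColorMatrixRow computes each output channel as a dot product with one
+// row of the 4x4 matrix; per pixel (a scalar sketch, c = 0..3 for B,G,R,A):
+//   out[c] = clamp255((b * m[4 * c + 0] + g * m[4 * c + 1] +
+//                      r * m[4 * c + 2] + a * m[4 * c + 3]) >> 6);
+// __lsx_vdp2_h_bu_b multiplies unsigned pixel bytes by the signed matrix
+// entries pairwise, and __lsx_vhaddw_w_h finishes each 4-term sum.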
+
+void SplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);
+ __lsx_vst(dst0, dst_u, 0);
+ __lsx_vst(dst1, dst_u, 16);
+ __lsx_vst(dst2, dst_v, 0);
+ __lsx_vst(dst3, dst_v, 16);
+ src_uv += 64;
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {
+ int x;
+ int len = width / 16;
+ __m128i dst0 = __lsx_vreplgr2vr_b(v8);
+
+ for (x = 0; x < len; x++) {
+ __lsx_vst(dst0, dst, 0);
+ dst += 16;
+ }
+}
+
+void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};
+ __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};
+
+ src_uv += (width << 1);
+ for (x = 0; x < len; x++) {
+ src_uv -= 64;
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
+ src3, src0, src1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
+ shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_v, 0);
+ __lsx_vst(dst1, dst_v, 16);
+ __lsx_vst(dst2, dst_u, 0);
+ __lsx_vst(dst3, dst_u, 16);
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void HalfFloatRow_LSX(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ int x;
+ int len = width / 32;
+ float mult = 1.9259299444e-34f * scale;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);
+ __m128i zero = __lsx_vldi(0);
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
+ reg6);
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
+ reg7);
+ DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
+ reg3, vec_mult, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
+ reg7, vec_mult, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
+ (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
+ (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ src += 32;
+ dst += 32;
+ }
+}
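+
+// HalfFloatRow relies on the classic exponent-rebias trick: 1.9259299444e-34f
+// is 2^-112, the gap between the float exponent bias (127) and the half bias
+// (15), so after the multiply the float bit pattern shifted right by 13
+// (23 - 10 mantissa bits) is the IEEE binary16 encoding. A scalar sketch for
+// values that land in the normal half range:
+//   float f = src[i] * scale * 1.9259299444e-34f;
+//   uint32_t bits;
+//   memcpy(&bits, &f, sizeof(bits));
+//   dst[i] = (uint16_t)(bits >> 13);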
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
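+
+// These tables feed the Y-matrix rows below; per pixel they evaluate the
+// fixed-point luma (a worked form of the maddw accumulation):
+//   Y = (uint8_t)((kRGBToY[0] * B + kRGBToY[1] * G + kRGBToY[2] * R +
+//                  kAddY) >> 8);
+// e.g. kRgb24I601Constants gives Y = (25*B + 129*G + 66*R + 0x1080) >> 8,
+// the BT.601 studio-range luma with its +16 offset and rounding built into
+// kAddY.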
+
+// ARGB expects the first 3 bytes of each pixel to contain RGB; the 4th
+// byte (alpha) is ignored.
+static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "1: \n\t"
+ "vld $vr4, %0, 0 \n\t"
+ "vld $vr5, %0, 16 \n\t"
+ "vld $vr6, %0, 32 \n\t"
+ "vld $vr7, %0, 48 \n\t" // load 16 pixels of
+ // ARGB
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR
+ "vpickev.b $vr10, $vr7, $vr6 \n\t"
+ "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA
+ "vpickod.b $vr11, $vr7, $vr6 \n\t"
+ "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t"
+ "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G
+ "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t"
+ "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t"
+ "addi.d %0, %0, 64 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_argb), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants)
+ : "memory");
+}
+
+void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first byte of each pixel to be alpha (ignored), followed
+// by 3 bytes of RGB.
+// Same code as ARGB, except for which even/odd byte lanes feed each multiply.
+static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "1: \n\t"
+ "vld $vr4, %0, 0 \n\t"
+ "vld $vr5, %0, 16 \n\t"
+ "vld $vr6, %0, 32 \n\t"
+ "vld $vr7, %0, 48 \n\t" // load 16 pixels of
+ // RGBA
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG
+ "vpickev.b $vr10, $vr7, $vr6 \n\t"
+ "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR
+ "vpickod.b $vr11, $vr7, $vr6 \n\t"
+ "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t"
+ "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G
+ "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t"
+ "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t"
+ "addi.d %0, %0, 64 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants)
+ : "memory");
+}
+
+void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
+ 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
+ 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
+ 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
+ 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "vld $vr4, %4, 0 \n\t" // load shuff
+ "vld $vr5, %4, 16 \n\t"
+ "vld $vr6, %4, 32 \n\t"
+ "vld $vr7, %4, 48 \n\t"
+ "1: \n\t"
+ "vld $vr8, %0, 0 \n\t"
+ "vld $vr9, %0, 16 \n\t"
+ "vld $vr10, %0, 32 \n\t" // load 16 pixels of
+ // RGB
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t"
+ "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t"
+ "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t"
+ "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t"
+ "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G
+ "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t"
+ "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t"
+ "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t"
+ "addi.d %0, %0, 48 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), // %3
+ "r"(shuff) // %4
+ : "memory");
+}
+
+void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/row_msa.cc b/source/row_msa.cc
index 5c0239a3..b7d5bb5e 100644
--- a/files/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -24,16 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
-#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
- { \
- ub = __msa_fill_w(yuvconst->kUVToB[0]); \
- vr = __msa_fill_w(yuvconst->kUVToR[1]); \
- ug = __msa_fill_w(yuvconst->kUVToG[0]); \
- vg = __msa_fill_w(yuvconst->kUVToG[1]); \
- bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
- bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
- br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
- yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { \
+ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
+ vr = __msa_fill_w(yuvconst->kUVToR[1]); \
+ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
+ vg = __msa_fill_w(yuvconst->kUVToG[1]); \
+ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+ yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
}
// Load YUV 422 pixel data
@@ -70,54 +68,52 @@ extern "C" {
}
// Convert 8 pixels of YUV 420 to RGB.
-#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
- { \
- v8i16 vec0_m, vec1_m; \
- v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
- v4i32 reg5_m, reg6_m, reg7_m; \
- v16i8 zero_m = {0}; \
- \
- vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
- vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
- reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg0_m *= yg; \
- reg1_m *= yg; \
- reg2_m *= ubvr; \
- reg3_m *= ubvr; \
- reg0_m = __msa_srai_w(reg0_m, 16); \
- reg1_m = __msa_srai_w(reg1_m, 16); \
- reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
- reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
- reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
- reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
- reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
- reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
- reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
- reg5_m = reg0_m - reg5_m; \
- reg6_m = reg1_m - reg6_m; \
- reg2_m = reg0_m - reg2_m; \
- reg3_m = reg1_m - reg3_m; \
- reg7_m = reg0_m - reg7_m; \
- reg4_m = reg1_m - reg4_m; \
- reg5_m += bb; \
- reg6_m += bb; \
- reg7_m += bg; \
- reg4_m += bg; \
- reg2_m += br; \
- reg3_m += br; \
- reg5_m = __msa_srai_w(reg5_m, 6); \
- reg6_m = __msa_srai_w(reg6_m, 6); \
- reg7_m = __msa_srai_w(reg7_m, 6); \
- reg4_m = __msa_srai_w(reg4_m, 6); \
- reg2_m = __msa_srai_w(reg2_m, 6); \
- reg3_m = __msa_srai_w(reg3_m, 6); \
- CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
- out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
- out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
- out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m; \
+ v16i8 temp_m, zero_m = {0}; \
+ \
+ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
+ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
+ vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); \
+ temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); \
+ reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); \
+ reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \
+ reg0_m *= yg; \
+ reg1_m *= yg; \
+ reg2_m *= ubvr; \
+ reg3_m *= ubvr; \
+ reg0_m = __msa_srai_w(reg0_m, 16); \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg0_m += yb; \
+ reg1_m += yb; \
+ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
+ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+ reg5_m = reg0_m + reg5_m; \
+ reg6_m = reg1_m + reg6_m; \
+ reg2_m = reg0_m + reg2_m; \
+ reg3_m = reg1_m + reg3_m; \
+ reg7_m = reg0_m - reg7_m; \
+ reg4_m = reg1_m - reg4_m; \
+ reg5_m = __msa_srai_w(reg5_m, 6); \
+ reg6_m = __msa_srai_w(reg6_m, 6); \
+ reg7_m = __msa_srai_w(reg7_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
+ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
}
// Pack and Store 8 ARGB values.
@@ -155,11 +151,10 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages it to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
@@ -195,81 +190,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
}
-// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
{ \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
\
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -285,6 +280,34 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { \
+ v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
+ v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
+ _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
+ _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
+ _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
+ _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
+ _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
+ _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
+ _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
+ _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
+ _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
+ _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
+ _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
+ _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
+ _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
+ _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
+ _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
+ _reg1 = const_8080 + const_112 * _reg0; \
+ _reg3 = const_8080 + const_112 * _reg4; \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
+ _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
+ }
+
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -302,6 +325,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < width; x += 8) {
+ src = LD_UH(src_uv);
+ dst = __msa_vshf_h(shuffler, src, src);
+ ST_UH(dst, dst_uv);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -376,20 +413,19 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_y += 8;
src_u += 4;
@@ -407,20 +443,19 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
src_y += 8;
src_u += 4;
@@ -440,12 +475,12 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
int64_t data_a;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v4i32 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -454,8 +489,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
STOREARGB(vec0, vec1, vec2, src3, dst_argb);
src_y += 8;
@@ -476,17 +510,17 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
int64_t data_u, data_v;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 reg0, reg1, reg2, reg3;
v2i64 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
11, 29, 12, 13, 30, 14, 15, 31};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -499,10 +533,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec3, vec4, vec5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5);
reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -529,24 +561,23 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec2, vec1);
- vec0 = __msa_srai_h(vec0, 3);
- vec1 = __msa_srai_h(vec1, 3);
- vec2 = __msa_srai_h(vec2, 2);
- vec1 = __msa_slli_h(vec1, 11);
- vec2 = __msa_slli_h(vec2, 5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ vec0 = __msa_srli_h(vec0, 3);
+ vec1 = __msa_srli_h(vec1, 2);
+ vec2 = __msa_srli_h(vec2, 3);
+ vec2 = __msa_slli_h(vec2, 11);
+ vec1 = __msa_slli_h(vec1, 5);
vec0 |= vec1;
dst0 = (v16u8)(vec2 | vec0);
ST_UB(dst0, dst_rgb565);
@@ -568,25 +599,24 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+ v8u16 mask = (v8u16)__msa_fill_h(0x00F0);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 4);
- reg1 = (v8u16)__msa_srai_h(vec1, 4);
- reg2 = (v8u16)__msa_srai_h(vec2, 4);
- reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
- reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 4);
+ reg2 = (v8u16)__msa_srli_h(vec2, 4);
+ reg1 = (v8u16)__msa_and_v(vec1, mask);
+ reg2 = (v8u16)__msa_slli_h(reg2, 8);
reg1 |= const_0xF000;
reg0 |= reg2;
dst0 = (v16u8)(reg1 | reg0);
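For ARGB4444 the green nibble no longer shifts down and back up: vec1 & 0x00F0 (the new mask constant) keeps the top four green bits already sitting at bit 4, saving a shift per lane. The scalar equivalent, with alpha forced to 0xF:

/* Pack clamped 8-bit B, G, R into ARGB4444 with opaque alpha. */
static uint16_t pack_argb4444(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(0xF000 | ((r >> 4) << 8) | (g & 0xF0) | (b >> 4));
}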
@@ -608,23 +638,22 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 3);
- reg1 = (v8u16)__msa_srai_h(vec1, 3);
- reg2 = (v8u16)__msa_srai_h(vec2, 3);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 3);
+ reg1 = (v8u16)__msa_srli_h(vec1, 3);
+ reg2 = (v8u16)__msa_srli_h(vec2, 3);
reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
reg1 |= const_0x8000;
@@ -768,7 +797,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
}
}
-void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -779,10 +808,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
@@ -809,38 +838,39 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
+ const uint8_t* src_argb_next = src_argb + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2F);

+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -861,14 +891,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 = __msa_hadd_u_h(vec5, vec5);
reg4 = __msa_hadd_u_h(vec0, vec0);
reg5 = __msa_hadd_u_h(vec1, vec1);
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -889,12 +919,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@@ -925,8 +961,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
- src_argb0 += 128;
- src_argb0_next += 128;
+ src_argb += 128;
+ src_argb_next += 128;
dst_u += 16;
dst_v += 16;
}
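The chroma coefficients above are halved (0x70 to 0x38, 0x4A to 0x25, 0x26 to 0x13, 0x5E to 0x2F, 0x12 to 0x09) while the subsampling shift drops from 2 to 1 with a +1 rounding term, so the 2x2 block average is now rounded instead of truncated and the overall >>8 scale is preserved. In scalar terms, per output U sample (V is analogous with the 0x38/0x2F/0x09 weights; helper name illustrative):

/* b4/g4/r4 are channel sums over a 2x2 block, as formed by the hadd steps. */
static uint8_t block_to_u(int b4, int g4, int r4) {
  int b = (b4 + 1) >> 1;  /* rounded half-sum == 2x the block average */
  int g = (g4 + 1) >> 1;
  int r = (r4 + 1) >> 1;
  return (uint8_t)((0x38 * b - 0x25 * g - 0x13 * r + 0x8080) >> 8);
}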
@@ -1153,7 +1189,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
}
}
-void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1164,7 +1200,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
@@ -1186,13 +1222,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_argb);
- src_argb0 += 16;
+ src_argb += 16;
src_argb1 += 16;
dst_argb += 16;
}
}
-void ARGBAddRow_MSA(const uint8_t* src_argb0,
+void ARGBAddRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1200,20 +1236,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
}
-void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+void ARGBSubtractRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1221,14 +1257,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_subs_u_b(src0, src2);
dst1 = __msa_subs_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -1412,17 +1448,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
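ARGBGrayRow upgrades from 7-bit weights (15, 75, 38 with a rounded >>7) to the full 8-bit BT.601 full-range weights: 0x961D packs G=0x96 (150) and B=0x1D (29) for the byte dot-product, 0x4D (77) covers R, and srari rounds the final >>8. Scalar form:

/* Full-range luma used for gray: weights sum to 256, rounded shift. */
static uint8_t gray_y(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
}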
@@ -1656,56 +1692,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v16u8 dst0;
- v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
- v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
- v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, reg2, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
+ v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
- reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
- reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
- reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
- reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
- reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
- reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
- reg0 *= const_0x19;
- reg1 *= const_0x19;
- reg2 *= const_0x81;
- reg3 *= const_0x81;
- reg4 *= const_0x42;
- reg5 *= const_0x42;
- reg0 += reg2;
- reg1 += reg3;
- reg0 += reg4;
- reg1 += reg5;
- reg0 += const_0x1080;
- reg1 += const_0x1080;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_argb1555 += 32;
dst_y += 16;
}
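The rewritten ARGB1555ToYRow splits each 16-bit pixel into low/high bytes with pckev_b/pckod_b, isolates the 5-bit fields with byte masks, widens each to 8 bits by replicating the top bits, then applies the studio-range weights (66, 129, 25) plus the 0x1080 bias (a +16 offset and +128 rounding); the final pckod_b takes the high result byte, which is the >>8. A per-pixel sketch:

static uint8_t argb1555_to_y(uint16_t px) {
  int b5 = px & 0x1F, g5 = (px >> 5) & 0x1F, r5 = (px >> 10) & 0x1F;
  int b = (b5 << 3) | (b5 >> 2);  /* 5 -> 8 bits: replicate top bits */
  int g = (g5 << 3) | (g5 >> 2);
  int r = (r5 << 3) | (r5 >> 2);
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}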
@@ -1713,68 +1744,55 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v4u32 res0, res1, res2, res3;
- v16u8 dst0;
- v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
- v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
- v8i16 const_0x1080 = __msa_fill_h(0x1080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
- v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
+ v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src0 & const_0x7E0;
- vec2 = src0 & const_0xF800;
- vec3 = src1 & const_0x1F;
- vec4 = src1 & const_0x7E0;
- vec5 = src1 & const_0xF800;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
- reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
- reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
- reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
- reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
- reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
- reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
- vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
- vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
- vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
- vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
- vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
- vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
- vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
- vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
- res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
- res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
- res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
- res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
- res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
- res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
- res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
- res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
- res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
- res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
- res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
- res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
- vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
- vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
- dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_rgb565 += 32;
dst_y += 16;
}
}
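RGB565ToYRow follows the same pattern with 5-6-5 fields: green widens as (g << 2) | (g >> 4), and since tmp1 & 0xF8 already holds red pre-shifted into the top bits, red only needs the | (r >> 5) replication. Scalar equivalent:

static uint8_t rgb565_to_y(uint16_t px) {
  int b5 = px & 0x1F, g6 = (px >> 5) & 0x3F, r5 = px >> 11;
  int b = (b5 << 3) | (b5 >> 2);
  int g = (g6 << 2) | (g6 >> 4);  /* 6 -> 8 bits */
  int r = (r5 << 3) | (r5 >> 2);
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}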
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1789,9 +1807,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1810,12 +1828,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1830,9 +1848,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1851,7 +1869,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
@@ -1865,69 +1883,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
const uint16_t* s = (const uint16_t*)src_argb1555;
const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)s, 0);
src1 = (v8u16)__msa_ld_b((void*)s, 16);
src2 = (v8u16)__msa_ld_b((void*)t, 0);
src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- vec2 += src2 & const_0x1F;
- vec3 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
- vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
- vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
- reg0 = vec6 * const_0x70;
- reg1 = vec0 * const_0x4A;
- reg2 = vec2 * const_0x70;
- reg3 = vec0 * const_0x5E;
- reg0 += const_0x8080;
- reg1 += vec2 * const_0x26;
- reg2 += const_0x8080;
- reg3 += vec6 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ nexg = (v16u8)__msa_srli_b(tmp2, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg2 = (v16u8)__msa_andi_b(tmp3, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ reg2 = (v16u8)__msa_slli_b(reg2, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ nexg = (v16u8)__msa_or_v(nexg, reg2);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ nexr = (v16u8)__msa_srli_b(reg3, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ reg0 = (v16u8)__msa_slli_b(nexb, 3);
+ reg1 = (v16u8)__msa_slli_b(nexg, 3);
+ reg2 = (v16u8)__msa_slli_b(nexr, 3);
+ nexb = (v16u8)__msa_srli_b(nexb, 2);
+ nexg = (v16u8)__msa_srli_b(nexg, 2);
+ nexr = (v16u8)__msa_srli_b(nexr, 2);
+ nexb = (v16u8)__msa_or_v(reg0, nexb);
+ nexg = (v16u8)__msa_or_v(reg1, nexg);
+ nexr = (v16u8)__msa_or_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
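Once both rows' channels are widened to 8 bits, the vertical sums, dot products and 0x8080 bias all move into the shared RGBTOUV macro (defined earlier in row_msa.cc). Note the const_112..const_18 locals are named for the nominal weights but hold the halved values, matching the rounded half-sum inputs. Assuming RGBTOUV forms the same rounded half-sums as the ARGBToUV path above, per 2x2 block it computes, in scalar terms:

/* b, g, r: rounded half-sums of a 2x2 block (see block_to_u above). */
static void block_to_uv(int b, int g, int r, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((0x38 * b - 0x25 * g - 0x13 * r + 0x8080) >> 8);
  *v = (uint8_t)((0x38 * r - 0x2F * g - 0x09 * b + 0x8080) >> 8);
}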
@@ -1946,68 +1956,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
const uint16_t* s = (const uint16_t*)src_rgb565;
const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)s, 0);
- src1 = (v8u16)__msa_ld_b((void*)s, 16);
- src2 = (v8u16)__msa_ld_b((void*)t, 0);
- src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x3F;
- vec3 = src1 & const_0x3F;
- vec2 += src2 & const_0x3F;
- vec3 += src3 & const_0x3F;
- vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- reg0 = vec3 * const_0x70;
- reg1 = vec1 * const_0x4A;
- reg2 = vec4 * const_0x70;
- reg3 = vec1 * const_0x5E;
- reg0 += const_32896;
- reg1 += vec4 * const_0x26;
- reg2 += const_32896;
- reg3 += vec3 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)t, 0);
+ src3 = (v16u8)__msa_ld_b((void*)t, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ nexr = (v16u8)__msa_andi_b(tmp3, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ reg2 = (v16u8)__msa_srli_b(tmp2, 5);
+ reg3 = (v16u8)__msa_slli_b(reg3, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ reg2 = (v16u8)__msa_slli_b(nexb, 3);
+ reg3 = (v16u8)__msa_srli_b(nexb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ nexb = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ reg2 = (v16u8)__msa_slli_b(nexg, 2);
+ reg3 = (v16u8)__msa_srli_b(nexg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ reg2 = (v16u8)__msa_srli_b(nexr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ nexr = (v16u8)__msa_or_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
@@ -2017,26 +2016,27 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
}
}
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2F);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2085,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2122,26 +2126,27 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
}
}
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2F);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2195,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2236,13 +2245,13 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 zero = {0};
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2251,8 +2260,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2273,12 +2281,12 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 zero = {0};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2287,8 +2295,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
vec0 = vec0 >> 3;
vec1 = (vec1 >> 2) << 5;
vec2 = (vec2 >> 3) << 11;
@@ -2309,14 +2316,14 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v16u8 zero = {0};
v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2326,8 +2333,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2416,27 +2422,27 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
}
}
-void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
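ARGBToYJRow switches to the full-range weights with a plain +0x80 rounding bias and >>8. The packed constants encode two byte weights per halfword for __msa_dotp_u_h: the low byte multiplies the first (lower-addressed) byte of each pair and the high byte the second, which is how the BGRA/ABGR/RGBA variants below express the same 66/129/25 kernel for their byte orders (e.g. 0x4200 gives R*0x42 with A*0, and 0x1981 gives G*0x81 + B*0x19). A sketch of one such dot-product lane (name illustrative):

/* One __msa_dotp_u_h lane: a (lo, hi) byte pair dotted with a packed weight. */
static uint16_t dotp_lane(uint8_t lo, uint8_t hi, uint16_t w) {
  return (uint16_t)(lo * (w & 0xFF) + hi * (w >> 8));
}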
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
@@ -2444,19 +2450,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
@@ -2464,19 +2470,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
@@ -2484,81 +2490,143 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v4u32)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
@@ -2566,103 +2634,108 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
}
}
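ARGBToUVJRow's rewrite widens everything to 16 bits before averaging: ilvr_b/ilvl_b against zero unpack the bytes, the two rows are summed as halfwords, and aver_u_h produces a rounded half-sum, so the full-range (J) path keeps exact 2x2 sums instead of the old two-stage byte averaging and its double rounding. Each 16-pixel half yields an 8-byte result, and the trailing pckev_b pair merges the halves into full 16-byte U and V stores. The averaging step in scalar form:

/* Rounded 2x2 half-sum over 16-bit intermediates (no 8-bit overflow). */
static uint16_t avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint16_t left = (uint16_t)(a + c);   /* row0 + row1, left column */
  uint16_t right = (uint16_t)(b + d);  /* row0 + row1, right column */
  return (uint16_t)((left + right + 1) >> 1); /* what __msa_aver_u_h does */
}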
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
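BGRAToUVRow, ABGRToUVRow and RGBAToUVRow collapse onto the shared READ_ARGB/ARGBTOUV_H pair. Each iteration now covers 16 pixels (down from 32) and produces 8 chroma bytes per plane, extracted with __msa_copy_u_d; only the shuffler vectors differ between the three, selecting which halfword lane carries each channel for that byte order (the unused = 0xf entries are don't-care lanes). The raw *((uint64_t*)dst_v) store assumes dst_v may be addressed as a uint64_t; a portable sketch of the same store:

#include <stdint.h>
#include <string.h>
/* Alignment- and aliasing-safe equivalent of the cast store above. */
static void store_8bytes(uint8_t* dst, uint64_t v) {
  memcpy(dst, &v, sizeof(v));  /* compiles to a single store where legal */
}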
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2674,54 +2747,57 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
int width) {
int x;
v16u8 src0, src1, src2, dst0, dst1;
- v8u16 vec0, vec1, vec2;
+ v8i16 vec0, vec1, vec2;
v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 zero = {0};
+ v4i32 const_0x80 = __msa_fill_w(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
for (x = 0; x < width; x += 8) {
READI444(src_y, src_u, src_v, src0, src1, src2);
- vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg0 *= vec_yg;
reg1 *= vec_yg;
reg0 = __msa_srai_w(reg0, 16);
reg1 = __msa_srai_w(reg1, 16);
- reg4 = reg0 + vec_br;
- reg5 = reg1 + vec_br;
- reg2 = reg0 + vec_bg;
- reg3 = reg1 + vec_bg;
- reg0 += vec_bb;
- reg1 += vec_bb;
+ reg0 += vec_yb;
+ reg1 += vec_yb;
     vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
     vec1 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
- reg0 -= reg6 * vec_ub;
- reg1 -= reg7 * vec_ub;
- reg2 -= reg6 * vec_ug;
- reg3 -= reg7 * vec_ug;
- reg4 -= reg8 * vec_vr;
- reg5 -= reg9 * vec_vr;
- reg2 -= reg8 * vec_vg;
- reg3 -= reg9 * vec_vg;
- reg0 = __msa_srai_w(reg0, 6);
- reg1 = __msa_srai_w(reg1, 6);
- reg2 = __msa_srai_w(reg2, 6);
- reg3 = __msa_srai_w(reg3, 6);
- reg4 = __msa_srai_w(reg4, 6);
- reg5 = __msa_srai_w(reg5, 6);
+ reg6 -= const_0x80;
+ reg7 -= const_0x80;
+ reg8 -= const_0x80;
+ reg9 -= const_0x80;
+ tmp0 = reg0 + reg6 * vec_ub;
+ tmp1 = reg1 + reg7 * vec_ub;
+ tmp2 = reg0 + reg8 * vec_vr;
+ tmp3 = reg1 + reg9 * vec_vr;
+ tmp4 = reg6 * vec_ug;
+ tmp5 = reg7 * vec_ug;
+ tmp4 += reg8 * vec_vg;
+ tmp5 += reg9 * vec_vg;
+ tmp4 = reg0 - tmp4;
+ tmp5 = reg1 - tmp5;
+ reg0 = __msa_srai_w(tmp0, 6);
+ reg1 = __msa_srai_w(tmp1, 6);
+ reg2 = __msa_srai_w(tmp2, 6);
+ reg3 = __msa_srai_w(tmp3, 6);
+ reg4 = __msa_srai_w(tmp4, 6);
+ reg5 = __msa_srai_w(tmp5, 6);
CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
     vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
-    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
-    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec2 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
     vec0 = (v8i16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
     vec1 = (v8i16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
@@ -2734,13 +2810,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO: Respect YuvConstants.
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -2814,12 +2901,12 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2827,8 +2914,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_yuy2 += 16;
dst_argb += 32;
@@ -2842,12 +2928,12 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2855,8 +2941,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_uyvy += 16;
dst_argb += 32;
@@ -3001,12 +3086,12 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
}
}
-void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+void ARGBBlendRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3015,8 +3100,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
@@ -3051,16 +3136,16 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -3082,7 +3167,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3400,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/source/row_neon.cc b/source/row_neon.cc
new file mode 100644
index 00000000..31142a90
--- /dev/null
+++ b/source/row_neon.cc
@@ -0,0 +1,3999 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// d8-d15, r4-r11 and r14(lr) need to be preserved if used. r13(sp) and
+// r15(pc) are reserved.
+
+// q0: Y uint16x8_t
+// d2: U uint8x8_t
+// d3: V uint8x8_t
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.32 {d2[0]}, [%[src_u]]! \n" \
+ "vld1.32 {d2[1]}, [%[src_v]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmovl.u8 q1, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 q1, q1, #8 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_u]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vld1.8 {d3}, [%[src_v]]! \n" \
+ "vsli.u16 q0, q0, #8 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vmov.u8 q1, #128 \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vsli.u16 q0, q0, #8 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_uv]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \
+ "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_vu]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \
+ "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */
+
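+// A scalar sketch of the vsli/vsri byte-duplication trick used above: with a
+// U,V byte pair sitting in one 16-bit lane, vsli.u16 #8 spreads the low byte
+// across the whole lane and vsri.u16 #8 spreads the high byte. Helper names
+// are illustrative, not part of the library.
+static inline uint16_t DupLowByte(uint16_t x) {  // vsli.u16 d, d, #8
+  return (uint16_t)(((x & 0xffu) << 8) | (x & 0xffu));
+}
+static inline uint16_t DupHighByte(uint16_t x) {  // vsri.u16 d, d, #8
+  return (uint16_t)((x & 0xff00u) | (x >> 8));
+}
+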
+// Read 8 YUY2
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \
+ "vmovl.u8 q0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
+
+// TODO: Use single register for kUVCoeff and multiply by lane
+#define YUVTORGB_SETUP \
+ "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \
+ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
+ "vdup.u16 q10, d31[1] \n" \
+ "vdup.u16 q11, d31[2] \n" \
+ "vdup.u16 q12, d31[3] \n" \
+ "vdup.u16 d31, d31[0] \n"
+
+// q0: B uint16x8_t
+// q1: G uint16x8_t
+// q2: R uint16x8_t
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "vmull.u16 q2, d1, d31 \n" \
+ "vmull.u8 q8, d3, d29 \n" /* DGV */ \
+ "vmull.u16 q0, d0, d31 \n" \
+ "vmlal.u8 q8, d2, d28 \n" /* DG */ \
+ "vqshrn.u32 d0, q0, #16 \n" \
+ "vqshrn.u32 d1, q2, #16 \n" /* Y */ \
+ "vmull.u8 q9, d2, d26 \n" /* DB */ \
+ "vmull.u8 q2, d3, d27 \n" /* DR */ \
+ "vadd.u16 q4, q0, q11 \n" /* G */ \
+ "vadd.u16 q2, q0, q2 \n" /* R */ \
+ "vadd.u16 q0, q0, q9 \n" /* B */ \
+ "vqsub.u16 q1, q4, q8 \n" /* G */ \
+ "vqsub.u16 q0, q0, q10 \n" /* B */ \
+ "vqsub.u16 q2, q2, q12 \n" /* R */
+
+// Convert from 2.14 fixed point RGB to 8 bit RGB
+#define RGBTORGB8 \
+ "vqshrn.u16 d4, q2, #6 \n" /* R */ \
+ "vqshrn.u16 d2, q1, #6 \n" /* G */ \
+ "vqshrn.u16 d0, q0, #6 \n" /* B */
+
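+// A minimal scalar model of READYUV*/YUVTORGB/RGBTORGB8 for one pixel. The
+// parameter names (yg, ub, ug, vg, vr and the bb/bg/br biases) are
+// illustrative stand-ins for the values packed into kRGBCoeffBias and
+// kUVCoeff, and arithmetic is widened to 32 bits for clarity: a sketch of
+// the 2.14 fixed-point math, not the library API.
+static inline uint8_t Fixed2_14ToU8(uint32_t v) {  // vqshrn.u16 #6
+  v >>= 6;
+  return (uint8_t)(v > 255u ? 255u : v);
+}
+static inline void YuvPixelSketch(uint8_t y8, uint8_t u, uint8_t v,
+                                  uint16_t yg, uint8_t ub, uint8_t ug,
+                                  uint8_t vg, uint8_t vr, uint16_t bb,
+                                  uint16_t bg, uint16_t br, uint8_t* b,
+                                  uint8_t* g, uint8_t* r) {
+  uint32_t y16 = y8 * 0x0101u;          // vsli.u16: replicate Y into 8.8
+  uint32_t y = (y16 * yg) >> 16;        // vmull.u16 + vqshrn.u32 #16
+  uint32_t dg = u * ug + v * vg;        // vmull.u8 + vmlal.u8
+  uint32_t tb = y + u * ub;             // vadd.u16
+  uint32_t tg = y + bg;
+  uint32_t tr = y + v * vr;
+  *b = Fixed2_14ToU8(tb > bb ? tb - bb : 0);  // vqsub.u16 floors at zero
+  *g = Fixed2_14ToU8(tg > dg ? tg - dg : 0);
+  *r = Fixed2_14ToU8(tr > br ? tr - br : 0);
+}
+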
+#define YUVTORGB_REGS \
+ "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"
+
+#define STORERGBA \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d4 \n" \
+ "vmov.u8 d0, d6 \n" \
+ "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n"
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTORGB565 \
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q2, q1, #5 \n" /* RG */ \
+ "vsri.16 q2, q0, #11 \n" /* RGB */
+
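+// Scalar equivalent of ARGBTORGB565: truncate each channel to its field width
+// and pack as RRRRRGGGGGGBBBBB. The NEON version reaches the same result by
+// widening each channel to the top of a 16-bit lane, then using
+// shift-right-insert (vsri) to slide G under R and B under G.
+static inline uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
+}
+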
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTOARGB1555 \
+ "vshll.u8 q3, d6, #8 \n" /* A */ \
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q3, q2, #1 \n" /* AR */ \
+ "vsri.16 q3, q1, #6 \n" /* ARG */ \
+ "vsri.16 q3, q0, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vmov.u8 d6, #0xff \n" ARGBTOARGB1555
+ "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d0, d0, #4 \n" /* B */ \
+ "vbic.32 d2, d2, d7 \n" /* G */ \
+ "vshr.u8 d4, d4, #4 \n" /* R */ \
+ "vbic.32 d6, d6, d7 \n" /* A */ \
+ "vorr d0, d0, d2 \n" /* BG */ \
+ "vorr d1, d4, d6 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
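+// Scalar equivalent of ARGBTOARGB4444: keep the high nibble of each channel.
+// The NEON version builds the two output bytes directly, (G & 0xf0) | (B >> 4)
+// and (A & 0xf0) | (R >> 4), then interleaves them with vzip.
+static inline uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g,
+                                    uint8_t b) {
+  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
+                    (b >> 4));
+}
+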
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "vmov.u8 d7, #0x0f \n" // vbic bits to clear
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
+}
+
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23");
+}
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+// Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 bytes of Y from a tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
+// MM21 UV tiles are 8x16, so src_tile_stride = 256 bytes.
+// width is measured in bytes, so 8 UV pairs = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, #1792] \n"
+ "vst1.8 {q0}, [%1]! \n" // store 16 bytes
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "q0" // Clobber List
+ );
+}
+
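+// A scalar sketch of the same walk: each 16-byte group of the output row
+// comes from a different tile, so the source pointer advances by
+// src_tile_stride between groups. Illustrative only.
+static void DetileRowSketch_C(const uint8_t* src,
+                              ptrdiff_t src_tile_stride,
+                              uint8_t* dst,
+                              int width) {
+  int x, i;
+  for (x = 0; x < width; x += 16) {
+    for (i = 0; i < 16; ++i) {
+      dst[i] = src[i];  // copy one 16-byte row of the current tile
+    }
+    dst += 16;
+    src += src_tile_stride;  // jump to the same row of the next tile
+  }
+}
+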
+// Reads 16 16-bit Y values from a tile and writes out 16 Y's.
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, #3584] \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride * 2) // %3
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], %4 \n"
+ "subs %3, %3, #16 \n"
+ "pld [%0, #1792] \n"
+ "vst1.8 {d0}, [%1]! \n"
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "d0", "d1" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST2
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
+ "pld [%0, #1792] \n"
+ "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
+ "pld [%1, #1792] \n"
+ "subs %3, %3, #16 \n"
+ "vst2.8 {q0, q1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list
+ );
+}
+#else
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
+ "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
+ "subs %3, %3, #16 \n"
+ "pld [%0, #1792] \n"
+ "vzip.8 q0, q1 \n"
+ "pld [%1, #1792] \n"
+ "vst1.8 {q0, q1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list
+ );
+}
+#endif
+
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q14}, [%0]! \n" // Load lower bits.
+ "vld1.8 {q9}, [%0]! \n" // Load upper bits row
+ // by row.
+ "vld1.8 {q11}, [%0]! \n"
+ "vld1.8 {q13}, [%0]! \n"
+ "vld1.8 {q15}, [%0]! \n"
+ "vshl.u8 q8, q14, #6 \n" // Shift lower bit data
+ // appropriately.
+ "vshl.u8 q10, q14, #4 \n"
+ "vshl.u8 q12, q14, #2 \n"
+ "vzip.u8 q8, q9 \n" // Interleave upper and
+ // lower bits.
+ "vzip.u8 q10, q11 \n"
+ "vzip.u8 q12, q13 \n"
+ "vzip.u8 q14, q15 \n"
+ "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits
+ // into lower 6 bits for
+ // better accuracy in
+ // conversions.
+ "vsri.u16 q9, q9, #10 \n"
+ "vsri.u16 q10, q10, #10 \n"
+ "vsri.u16 q11, q11, #10 \n"
+ "vsri.u16 q12, q12, #10 \n"
+ "vsri.u16 q13, q13, #10 \n"
+ "vsri.u16 q14, q14, #10 \n"
+ "vsri.u16 q15, q15, #10 \n"
+ "vstmia %1!, {q8-q15} \n" // Store pixel block (64
+ // pixels).
+ "subs %2, %2, #80 \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(size) // %2
+ :
+ : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
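+// A scalar sketch of one 80-byte MT2T block (64 output pixels): 16 bytes of
+// packed 2-bit LSBs followed by four rows of 16 8-bit MSBs. The bit layout
+// (LSB pair of row r, column c at bits [2r+1:2r] of byte c) is inferred from
+// the shift/zip sequence above, so treat it as illustrative.
+static void UnpackMT2TBlockSketch_C(const uint8_t* src, uint16_t* dst) {
+  const uint8_t* lo = src;       // 2-bit LSBs, four rows packed per byte
+  const uint8_t* hi = src + 16;  // 8-bit MSBs, row by row
+  int p;
+  for (p = 0; p < 64; ++p) {
+    int row = p >> 4;
+    int col = p & 15;
+    uint16_t t = (uint16_t)((hi[p] << 2) | ((lo[col] >> (2 * row)) & 3));
+    dst[p] = (uint16_t)((t << 6) | (t >> 4));  // vsri: copy top 6 bits down
+  }
+}
+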
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "vst1.8 {q3}, [%4]! \n" // store A
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "vld1.8 {q3}, [%3]! \n" // load A
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
+void SplitXRGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB (A = 255).
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 q3, #255 \n" // load A(255)
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = 10 - depth;
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "vdup.32 q15, %5 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vshl.u32 q2, q2, q15 \n" // 000B
+ "vshl.u32 q1, q1, q15 \n"
+ "vshl.u32 q0, q0, q15 \n"
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "q14", "q15");
+}
+
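+// Scalar equivalent of the packing above: AR30 is a little-endian 32-bit word
+// with a 2-bit alpha above 10-bit R, G and B fields. Channels are shifted to
+// 10-bit range and clamped to 1023 before packing.
+static inline uint32_t PackAR30(uint32_t r10, uint32_t g10, uint32_t b10) {
+  return 0xc0000000u | (r10 << 20) | (g10 << 10) | b10;  // A = 3 (opaque)
+}
+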
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width) {
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // 000B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ "3: \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q14");
+}
+
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "vdup.u16 q15, %6 \n"
+ "vdup.u16 q14, %7 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vld1.16 {q3}, [%3]! \n" // A
+ "vmin.u16 q2, q2, q14 \n"
+ "vmin.u16 q1, q1, q14 \n"
+ "vmin.u16 q0, q0, q14 \n"
+ "vmin.u16 q3, q3, q14 \n"
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vshl.u16 q3, q3, q15 \n"
+ "subs %5, %5, #8 \n"
+ "vst4.16 {d0, d2, d4, d6}, [%4]! \n"
+ "vst4.16 {d1, d3, d5, d7}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+ "+r"(width) // %5
+ : "r"(shift), // %6
+ "r"(mask) // %7
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
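+// Scalar model of the clamp-and-shift above: a depth-bit channel is clamped
+// to its own range (vmin.u16), then shifted so its top bit lands at bit 15
+// (vshl.u16). For depth = 10 that is a clamp to 1023 followed by << 6.
+static inline uint16_t ScaleChannelTo16Sketch(uint16_t v, int depth) {
+  uint16_t mask = (uint16_t)((1 << depth) - 1);
+  if (v > mask) v = mask;
+  return (uint16_t)(v << (16 - depth));
+}
+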
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "vmov.u8 q3, #0xff \n" // A (0xffff)
+ "vdup.u16 q15, %5 \n"
+ "vdup.u16 q14, %6 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vmin.u16 q2, q2, q14 \n"
+ "vmin.u16 q1, q1, q14 \n"
+ "vmin.u16 q0, q0, q14 \n"
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "subs %4, %4, #8 \n"
+ "vst4.16 {d0, d2, d4, d6}, [%3]! \n"
+ "vst4.16 {d1, d3, d5, d7}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "r"(shift), // %5
+ "r"(mask) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %6 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vld1.16 {q3}, [%3]! \n" // A
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vshl.u16 q3, q3, q15 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d1, q1 \n"
+ "vqmovn.u16 d2, q2 \n"
+ "vqmovn.u16 d3, q3 \n"
+ "subs %5, %5, #8 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : "r"(shift) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %5 \n"
+ "vmov.u8 d6, #0xff \n" // A (0xff)
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vqmovn.u16 d5, q2 \n"
+ "vqmovn.u16 d4, q1 \n"
+ "vqmovn.u16 d3, q0 \n"
+ "subs %4, %4, #8 \n"
+ "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
+}
+
+// Copy multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on A15.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0");
+}
+
+// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0");
+}
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
+
+ "1: \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
+
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "vmov.u8 d0, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
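+// Scalar equivalent of RGB565TOARGB: expand each field to 8 bits by shifting
+// it up and replicating its top bits into the vacated low bits, which maps 0
+// to 0 and the field maximum to 255.
+static inline void RGB565To888Sketch(uint16_t p, uint8_t* b, uint8_t* g,
+                                     uint8_t* r) {
+  uint8_t b5 = (uint8_t)(p & 0x1f);
+  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
+  uint8_t r5 = (uint8_t)(p >> 11);
+  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
+  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
+  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
+}
+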
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels.
+ "vst3.8 {d1, d3, d5}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
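+// vrhadd.u8 is a rounding halving add; the scalar equivalent used to average
+// the chroma of two rows above is:
+static inline uint8_t RoundAvgU8(uint8_t a, uint8_t b) {
+  return (uint8_t)((a + b + 1) >> 1);
+}
+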
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 q4, q1, q3 \n" // average rows of UV
+ "vst1.8 {q4}, [%2]! \n" // store 8 UV.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
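+// Scalar model of the vtbl lookup above: within each 16-byte (4 pixel) group,
+// the shuffler supplies, for every output byte, the index of the source byte
+// to copy. Illustrative only; a per-pixel variant would apply the first four
+// indices to each pixel.
+static void ARGBShuffleSketch_C(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint8_t* shuffler,
+                                int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    dst[i] = src[(i & ~15) + shuffler[i & 15]];
+  }
+}
+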
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "d6");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vdup.32 d7, %2 \n" // dither4
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d7 \n"
+ "vqadd.u8 d2, d2, d7 \n"
+ "vqadd.u8 d4, d4, d7 \n" // add for dither
+ ARGBTORGB565
+ "vst1.8 {q2}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width) {
+ asm volatile(
+ "vmov.u8 d7, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+struct RgbUVConstants {
+ uint8_t kRGBToU[4];
+ uint8_t kRGBToV[4];
+};
+
+// 8x1 pixels -> 8 U, 8 V: full-resolution (4:4:4) chroma, no subsampling.
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width,
+ const struct RgbUVConstants* rgbuvconstants) {
+ asm volatile(
+
+ "vld1.8 {d0}, [%4] \n" // load rgbuvconstants
+ "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
+ "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient
+ "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient
+ "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient
+ "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+
+ "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
+ "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(rgbuvconstants) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+ "q15");
+}
+
+// RGB to bt601 coefficients
+// UB 0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR 0.875 coefficient = 112 (not loaded; equals UB, so d24 is reused)
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+ {18, 94, 112, 0}};
+
+// RGB to JPEG coefficients
+// UB coeff 0.500 = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500 = 127 (not loaded; equals UB)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+ {20, 107, 127, 0}};
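+
+// Illustrative scalar equivalent (exposition only; these helper names are
+// hypothetical and not part of the library). With either constant set above,
+// one pixel's chroma is:
+//   U = (UB * B - UG * G - UR * R + 0x8080) >> 8
+//   V = (VR * R - VG * G - VB * B + 0x8080) >> 8
+// The +0x8080 supplies both the +128 bias and the +0.5 rounding term that
+// the vaddhn with q15 (#0x8080) applies in the kernels above.
+static inline uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // BT.601
+}
+static inline uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);  // BT.601
+}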
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24JPegUVConstants);
+}
+
+// clang-format off
+// 16x2 pixels -> 8x1. width is the number of source pixels, e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \
+ "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */
+// clang-format on
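+
+// Arithmetic note (exposition, not from the source): callers hand this macro
+// 16-bit lanes holding 2x the average of a 2x2 block (vpaddl/vpadal sum four
+// bytes, vrshr #1 halves once with rounding). The coefficients are loaded
+// pre-halved (#112 / 2 etc.), so each product is average * full_coefficient,
+// and vaddhn with q15 (#0x8080) yields the same
+//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+// as the full-resolution path, evaluated on the box average. For the odd
+// JPEG coefficients the halving truncates (127 / 2 == 63, effectively 126),
+// a small known approximation.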
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match Intel code.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_uj), // %2
+ "+r"(dst_vj), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is the number of rgb565 pixels, e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is the number of argb1555 pixels, e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is the number of argb4444 pixels, e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vmov.u8 q1, q0 \n"
+ "vmov.u8 q3, q2 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "vld1.8 {q4}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vtbl.8 d2, {d0, d1}, d8 \n"
+ "vtbl.8 d3, {d0, d1}, d9 \n"
+ "vtbl.8 d6, {d4, d5}, d8 \n"
+ "vtbl.8 d7, {d4, d5}, d9 \n"
+ "vmov.u8 q0, q1 \n"
+ "vmov.u8 q2, q3 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vshrn.u16 d1, q1, #8 \n"
+ "vshrn.u16 d4, q2, #8 \n"
+ "vshrn.u16 d5, q3, #8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {d8}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vtbl.8 d0, {d0, d1}, d8 \n"
+ "vtbl.8 d1, {d2, d3}, d8 \n"
+ "vtbl.8 d4, {d4, d5}, d8 \n"
+ "vtbl.8 d5, {d6, d7}, d8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
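+
+// Illustrative scalar equivalent (exposition only; the helper name is
+// hypothetical). kAddY folds the +16 offset and the +0.5 rounding term into
+// one constant (0x1080 == 16.5 * 256), added by vaddhn in the kernels below:
+static inline uint8_t RGBToY601_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}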
+
+// ARGB expects the first 3 bytes to contain RGB; the 4th byte is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d0, d20 \n" // B
+ "vmull.u8 q9, d1, d20 \n"
+ "vmlal.u8 q8, d2, d21 \n" // G
+ "vmlal.u8 q9, d3, d21 \n"
+ "vmlal.u8 q8, d4, d22 \n" // R
+ "vmlal.u8 q9, d5, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first byte to be A (ignored), followed by 3 bytes of RGB.
+// Same code as ARGB, except which vld4 registers feed the multiplies.
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
+ // RGB24.
+ "vld3.8 {d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
+}
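+
+// Scalar sketch of the general blend path above (illustrative only; the
+// helper name is hypothetical). For a fraction f in [0, 256), each output
+// byte is a rounded weighted average of the two rows; the f == 0 and
+// f == 128 branches are fast cases of the same formula.
+static inline uint8_t InterpolatePixel_Sketch(uint8_t s0, uint8_t s1, int f) {
+  return (uint8_t)((s0 * (256 - f) + s1 * f + 128) >> 8);  // cf. vrshrn #8
+}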
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.16 d17, %4 \n"
+ "vdup.16 d16, %5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vmull.u16 q2, d0, d16 \n"
+ "vmull.u16 q3, d1, d16 \n"
+ "vmlal.u16 q2, d2, d17 \n"
+ "vmlal.u16 q3, d3, d17 \n"
+ "vrshrn.u32 d0, q2, #8 \n"
+ "vrshrn.u32 d1, q3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vrhadd.u16 q0, q1 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "subs %3, %3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
+}
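+
+// Scalar sketch of the blend above (illustrative only; the helper name is
+// hypothetical). The destination is faded by source alpha with a rounded
+// shift (vqrshrn), then the already-attenuated source is added with
+// saturation (vqadd), and alpha is forced to 255.
+static inline uint8_t BlendChannel_Sketch(uint8_t s, uint8_t d, uint8_t a) {
+  int faded = d - ((d * a + 128) >> 8);  // dr - dr * sa / 256, rounded
+  int sum = faded + s;                   // + sr
+  return (uint8_t)(sum > 255 ? 255 : sum);
+}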
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u16 q15, #0x00ff \n" // 255 for rounding up
+
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8
+ "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8
+ "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
+}
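+
+// Scalar sketch (illustrative only; the helper name is hypothetical).
+// vqdmulh doubles its product and keeps the high 16 bits, so pre-halving
+// scale gives essentially (v * scale) >> 16; the final vqmovn saturation is
+// omitted here.
+static inline uint8_t Quantize_Sketch(uint8_t v, int scale, int interval_size,
+                                      int interval_offset) {
+  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
+}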
+
+// Shade 8 pixels at a time by specified value.
+// NOTE: the scalar operand of vqrdmulh.s16 q10, q10, d0[0] must come from
+// d0 to d7, indexed with [0] to [3].
+// Rounding in vqrdmulh adds 1 to the high half if the high bit of the low
+// s16 is set.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
+}
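+
+// Arithmetic note (exposition, not from the source): vzip.u8 widens each
+// value byte to byte * 0x101, and the vshr halves it, so vqrdmulh computes
+// roughly (channel * byte * 0x101) >> 16 per channel, i.e. approximately
+// channel * value_byte / 255, a per-channel scale by value/255 with rounding.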
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+ "q14", "q15");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with a color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
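+// Scalar sketch of the loop below, per output pixel i, with the two source
+// rows y0 and y1 supplied by the caller:
+//   sobely[i] = min(255, |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) +
+//                         (y0[i+2] - y1[i+2])|)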
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the first or second float in the d-register.
+
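+// The constant 1.9259299444e-34f below is 2^-112. Multiplying by it rebiases
+// the float exponent from 127 to 15 (the half-float bias), so the
+// vqshrn.u32 #13 that follows can truncate the 23-bit mantissa to 10 bits
+// and emit raw IEEE half-float bits.
+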
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Filter one row horizontally with 1, 4, 6, 4, 1 coefficients to produce 1 row.
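+// Scalar sketch, per output sample i (saturating narrow to 16 bits):
+//   dst[i] = (src[i] + 4 * src[i+1] + 6 * src[i+2] + 4 * src[i+3] +
+//             src[i+4]) >> 8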
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n"
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // round and pack
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
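+// Half-scales U and V with a rounding 2x2 box filter and interleaves the
+// result as UV. Sketch per output u: (u00 + u01 + u10 + u11 + 2) >> 2,
+// and likewise for v.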
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = depth - 16; // Negative for right shift.
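+ // e.g. depth 10 gives shift -6; vshl by a negative amount shifts right,
+ // moving msb-aligned 16-bit samples down to 10-bit lsb values.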
+ asm volatile(
+ "vdup.16 q2, %4 \n"
+ "1: \n"
+ "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
+ "vshl.u16 q0, q0, q2 \n"
+ "vshl.u16 q1, q1, q2 \n"
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst1.16 {q0}, [%1]! \n" // store 8 U pixels
+ "vst1.16 {q1}, [%2]! \n" // store 8 V pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "vdup.16 q2, %4 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // load 8 U
+ "vld1.16 {q1}, [%1]! \n" // load 8 V
+ "vshl.u16 q0, q0, q2 \n"
+ "vshl.u16 q1, q1, q2 \n"
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "vdup.16 q2, %3 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vmul.u16 q0, q0, q2 \n"
+ "vmul.u16 q1, q1, q2 \n"
+ "vst1.16 {q0}, [%1]! \n"
+ "vst1.16 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
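+// Fixed-point divide: dst = (src * scale) >> 16, e.g. scale 0x8000 halves
+// each sample.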
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "vdup.16 d8, %3 \n"
+ "1: \n"
+ "vld1.16 {q2, q3}, [%0]! \n"
+ "vmull.u16 q0, d4, d8 \n"
+ "vmull.u16 q1, d5, d8 \n"
+ "vmull.u16 q2, d6, d8 \n"
+ "vmull.u16 q3, d7, d8 \n"
+ "vshrn.u32 d0, q0, #16 \n"
+ "vshrn.u32 d1, q1, #16 \n"
+ "vshrn.u32 d2, q2, #16 \n"
+ "vshrn.u32 d3, q3, #16 \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "d8");
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
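+// e.g. scale 16384: shift = 15 - __builtin_clz(16384) = 15 - 17 = -2, and a
+// vshl by -2 is a right shift by 2, mapping 10-bit input to 8-bit output.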
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ asm volatile(
+ "vdup.16 q2, %3 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative
+ "vshl.u16 q1, q1, q2 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d1, q1 \n"
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
new file mode 100644
index 00000000..1679f87c
--- /dev/null
+++ b/source/row_neon64.cc
@@ -0,0 +1,4630 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
+// STn over ZIP1+ST1
+// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// v0.8h: Y
+// v1.16b: 8U, 8V
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.s}[0], [%[src_u]], #4 \n" \
+ "ld1 {v1.s}[1], [%[src_v]], #4 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v1.16b, v1.16b, v1.16b \n" \
+ "prfm pldl1keep, [%[src_u], 128] \n" \
+ "prfm pldl1keep, [%[src_v], 128] \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.d}[0], [%[src_u]], #8 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "ld1 {v1.d}[1], [%[src_v]], #8 \n" \
+ "prfm pldl1keep, [%[src_u], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_v], 448] \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "movi v1.16b, #128 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n"
+
+static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
+ 1, 1, 3, 3, 5, 5, 7, 7};
+static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
+ 0, 0, 2, 2, 4, 4, 6, 6};
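+// The tbl shuffle in READNV12 below uses these tables to duplicate each
+// chroma byte (U0 U0 U1 U1 ... V0 V0 ...); kNV21Table additionally swaps the
+// U and V halves so the same macro serves NV21.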
+
+// Read 8 Y and 4 UV from NV12 or NV21
+#define READNV12 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ldr d1, [%[src_uv]], #8 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n" \
+ "prfm pldl1keep, [%[src_uv], 448] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_yuy2], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
+ "zip1 v0.16b, v4.16b, v4.16b \n" \
+ "prfm pldl1keep, [%[src_uyvy], 448] \n" \
+ "tbl v1.16b, {v3.16b}, v2.16b \n"
+
+// UB VR UG VG
+// YG BB BG BR
+#define YUVTORGB_SETUP \
+ "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
+ "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"
+
+// v16.8h: B
+// v17.8h: G
+// v18.8h: R
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "umull2 v3.4s, v0.8h, v24.8h \n" \
+ "umull v6.8h, v1.8b, v30.8b \n" \
+ "umull v0.4s, v0.4h, v24.4h \n" \
+ "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
+ "uqshrn v0.4h, v0.4s, #16 \n" \
+ "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
+ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
+ "add v17.8h, v0.8h, v26.8h \n" /* G */ \
+ "add v16.8h, v0.8h, v4.8h \n" /* B */ \
+ "add v18.8h, v0.8h, v5.8h \n" /* R */ \
+ "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
+ "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
+ "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
+
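+// In scalar terms (sketch): Y' = (Y * 0x0101 * YG) >> 16, then
+//   B = sat(Y' + U * UB - BB)
+//   G = sat(Y' + BG - (U * UG + V * VG))
+//   R = sat(Y' + V * VR - BR)
+// with unsigned saturating adds/subtracts, leaving 2.14 fixed point values.
+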
+// Convert from 2.14 fixed point RGB To 8 bit RGB
+#define RGBTORGB8 \
+ "uqshrn v17.8b, v17.8h, #6 \n" \
+ "uqshrn v16.8b, v16.8h, #6 \n" \
+ "uqshrn v18.8b, v18.8h, #6 \n"
+
+#define YUVTORGB_REGS \
+ "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
+ "v26", "v27", "v28", "v29", "v30", "v31"
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v15.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v15");
+}
+
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTORGB565 \
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v18.8h, v17.8h, #5 \n" /* RG */ \
+ "sri v18.8h, v16.8h, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v19.8b, #8 \n" /* A */ \
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v0.8h, v18.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v17.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v16.8h, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
+ "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+#define ARGBTOARGB4444 \
+ /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
+ "ushr v16.8b, v16.8b, #4 \n" /* B */ \
+ "bic v17.8b, v17.8b, v23.8b \n" /* G */ \
+ "ushr v18.8b, v18.8b, #4 \n" /* R */ \
+ "bic v19.8b, v19.8b, v23.8b \n" /* A */ \
+ "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
+ "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "movi v19.8b, #255 \n" ARGBTOARGB4444
+ "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
+ // pixels
+ // ARGB4444.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
+}
+
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+#if LIBYUV_USE_ST4
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23");
+}
+#else
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #255 \n"
+ "1: \n"
+ "ldr d16, [%0], #8 \n"
+ "subs %w2, %w2, #8 \n"
+ "zip1 v18.16b, v16.16b, v16.16b \n" // YY
+ "zip1 v19.16b, v16.16b, v20.16b \n" // YA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
+ "zip2 v17.16b, v18.16b, v19.16b \n"
+ "stp q16, q17, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
+}
+#endif // LIBYUV_USE_ST4
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
+ // pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+// Reads 16 pairs of UV and writes the U bytes to dst_u and the V bytes to dst_v.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 Y bytes from a tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
+// MM21 UV tiles are 8x16 (UV pairs), so src_tile_stride = 256 bytes.
+// width is measured in bytes, so 8 UV pairs = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "v0" // Clobber List
+ );
+}
+
+// Reads 16 Y values of 16 bits each from a tile and writes out 16 Y's.
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
+ "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride * 2) // %3
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%0, 1792] \n"
+ "st1 {v0.8b}, [%1], #8 \n"
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST2
+// Read 16 Y, 8 UV, and write 8 YUY2
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 1792] \n"
+ "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
+ "prfm pldl1keep, [%1, 1792] \n"
+ "subs %w3, %w3, #16 \n" // store 8 YUY2
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "v0", "v1" // Clobber list
+ );
+}
+#else
+// Read 16 Y, 8 UV, and write 8 YUY2
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
+ "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%0, 1792] \n"
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 1792] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list
+ );
+}
+#endif
+
+// Unpack MT2T into tiled P010 64 pixels at a time. See
+// tinyurl.com/mtk-10bit-video-format for format documentation.
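+// Each 80-byte block holds 16 bytes of packed 2-bit low bits followed by 64
+// bytes of 8-bit high bits; the zip/shl/sri sequence below rebuilds 64
+// 16-bit samples with the 10-bit value in the most significant bits.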
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v7.16b}, [%0], #16 \n"
+ "ld1 {v0.16b-v3.16b}, [%0], #64 \n"
+ "shl v4.16b, v7.16b, #6 \n"
+ "shl v5.16b, v7.16b, #4 \n"
+ "shl v6.16b, v7.16b, #2 \n"
+ "subs %2, %2, #80 \n"
+ "zip1 v16.16b, v4.16b, v0.16b \n"
+ "zip1 v18.16b, v5.16b, v1.16b \n"
+ "zip1 v20.16b, v6.16b, v2.16b \n"
+ "zip1 v22.16b, v7.16b, v3.16b \n"
+ "zip2 v17.16b, v4.16b, v0.16b \n"
+ "zip2 v19.16b, v5.16b, v1.16b \n"
+ "zip2 v21.16b, v6.16b, v2.16b \n"
+ "zip2 v23.16b, v7.16b, v3.16b \n"
+ "sri v16.8h, v16.8h, #10 \n"
+ "sri v17.8h, v17.8h, #10 \n"
+ "sri v18.8h, v18.8h, #10 \n"
+ "sri v19.8h, v19.8h, #10 \n"
+ "st1 {v16.8h-v19.8h}, [%1], #64 \n"
+ "sri v20.8h, v20.8h, #10 \n"
+ "sri v21.8h, v21.8h, #10 \n"
+ "sri v22.8h, v22.8h, #10 \n"
+ "sri v23.8h, v23.8h, #10 \n"
+ "st1 {v20.8h-v23.8h}, [%1], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(size) // %2
+ :
+ : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+}
+
+#if LIBYUV_USE_ST2
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v4.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v4.8h \n"
+ "ushl v1.8h, v1.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v2.8h, v0.8h, v1.8h \n"
+ "zip2 v3.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%3], #16 \n" // store B
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%1], #16 \n" // store R
+ "st1 {v3.16b}, [%4], #16 \n" // store A
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST4
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#else
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%2, 448] \n"
+ "zip1 v4.16b, v0.16b, v1.16b \n" // BG
+ "zip1 v5.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip2 v6.16b, v0.16b, v1.16b \n" // BG
+ "zip2 v7.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
+ "zip2 v1.8h, v4.8h, v5.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "zip1 v2.8h, v6.8h, v7.8h \n"
+ "zip2 v3.8h, v6.8h, v7.8h \n"
+ "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+#endif // LIBYUV_USE_ST4
+
+// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
+void SplitXRGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%3], #16 \n" // store B
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%1], #16 \n" // store R
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.16b, #255 \n" // load A(255)
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = 10 - depth;
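+ // AR30 packs bits [29:20] = R, [19:10] = G, [9:0] = B, with alpha 0b11 in
+ // bits [31:30] (the "orr #0xc0, lsl #24" below).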
+ asm volatile(
+ "movi v30.16b, #255 \n"
+ "ushr v30.4s, v30.4s, #22 \n" // 1023
+ "dup v31.4s, %w5 \n"
+ "1: \n"
+ "ldr d2, [%2], #8 \n" // B
+ "ldr d1, [%1], #8 \n" // G
+ "ldr d0, [%0], #8 \n" // R
+ "ushll v2.4s, v2.4h, #0 \n" // B
+ "ushll v1.4s, v1.4h, #0 \n" // G
+ "ushll v0.4s, v0.4h, #0 \n" // R
+ "ushl v2.4s, v2.4s, v31.4s \n" // 000B
+ "ushl v1.4s, v1.4s, v31.4s \n" // G
+ "ushl v0.4s, v0.4s, v31.4s \n" // R
+ "umin v2.4s, v2.4s, v30.4s \n"
+ "umin v1.4s, v1.4s, v30.4s \n"
+ "umin v0.4s, v0.4s, v30.4s \n"
+ "sli v2.4s, v1.4s, #10 \n" // 00GB
+ "sli v2.4s, v0.4s, #20 \n" // 0RGB
+ "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
+ "subs %w4, %w4, #4 \n"
+ "str q2, [%3], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
+}
+
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width) {
+ asm volatile(
+ "movi v30.16b, #255 \n"
+ "ushr v30.4s, v30.4s, #22 \n" // 1023
+ "1: \n"
+ "ldr d2, [%2], #8 \n" // B
+ "ldr d1, [%1], #8 \n" // G
+ "ldr d0, [%0], #8 \n" // R
+ "ushll v2.4s, v2.4h, #0 \n" // 000B
+ "ushll v1.4s, v1.4h, #0 \n" // G
+ "ushll v0.4s, v0.4h, #0 \n" // R
+ "umin v2.4s, v2.4s, v30.4s \n"
+ "umin v1.4s, v1.4s, v30.4s \n"
+ "umin v0.4s, v0.4s, v30.4s \n"
+ "sli v2.4s, v1.4s, #10 \n" // 00GB
+ "sli v2.4s, v0.4s, #20 \n" // 0RGB
+ "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
+ "subs %w4, %w4, #4 \n"
+ "str q2, [%3], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v30");
+}
+
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "dup v30.8h, %w7 \n"
+ "dup v31.8h, %w6 \n"
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ldr q3, [%3], #16 \n" // A
+ "umin v2.8h, v2.8h, v30.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umin v1.8h, v1.8h, v30.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umin v0.8h, v0.8h, v30.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umin v3.8h, v3.8h, v30.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "ushl v3.8h, v3.8h, v31.8h \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+ "+r"(width) // %5
+ : "r"(shift), // %6
+ "r"(mask) // %7
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "movi v3.16b, #0xff \n" // A (0xffff)
+ "dup v30.8h, %w6 \n"
+ "dup v31.8h, %w5 \n"
+
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "umin v2.8h, v2.8h, v30.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umin v1.8h, v1.8h, v30.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umin v0.8h, v0.8h, v30.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "r"(shift), // %5
+ "r"(mask) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "dup v31.8h, %w6 \n"
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ldr q3, [%3], #16 \n" // A
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v3.8h, v3.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v3.8b, v3.8h \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : "r"(shift) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "dup v31.8h, %w5 \n"
+ "movi v3.8b, #0xff \n" // A (0xff)
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+// Copy a multiple of 32 bytes.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0");
+}
+
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0");
+}
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
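+// Scalar sketch of the RGB565TOARGB expansion above (illustrative only,
+// hypothetical helper): each field is widened by shifting into the high
+// bits and replicating its top bits into the vacated low bits, which is
+// what the shl/ushr/orr pairs implement per lane.
+static inline uint32_t RGB565ToARGBPixel_Sketch(uint16_t rgb565) {
+  uint32_t b = rgb565 & 0x1f;
+  uint32_t g = (rgb565 >> 5) & 0x3f;
+  uint32_t r = (rgb565 >> 11) & 0x1f;
+  b = (b << 3) | (b >> 2);  // BBBBB000 | 00000BBB
+  g = (g << 2) | (g >> 4);  // GGGGGG00 | 000000GG
+  r = (r << 3) | (r >> 2);  // RRRRR000 | 00000RRR
+  return 0xff000000u | (r << 16) | (g << 8) | b;  // ARGB word, A = 255
+}
+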
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
+
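+// Scalar sketch of the nibble expansion in ARGB4444TOARGB (illustrative,
+// hypothetical helper): replicating a 4-bit field into 8 bits is
+// (x << 4) | x, i.e. x * 17, so 0xF maps to 0xFF and 0x8 to 0x88.
+static inline uint8_t Expand4To8_Sketch(uint8_t nibble) {
+  nibble &= 0x0f;
+  return (uint8_t)((nibble << 4) | nibble);
+}
+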
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
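+// YUY2 is packed as Y0 U0 Y1 V0 ...; the ld2 above de-interleaves even
+// bytes (all luma) into v0 and odd bytes (interleaved chroma) into v1, so
+// extracting Y is a single structured load plus a store.
+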
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
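+// urhadd is an unsigned rounding halving add, (a + b + 1) >> 1; it averages
+// the chroma of two consecutive rows to implement the vertical 2:1
+// subsample in the UV rows above and below.
+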
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row
+ "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
+ "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "dup v1.4s, %w3 \n" // dither4
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uqadd v16.8b, v16.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v17.8b, v17.8b, v1.8b \n"
+ "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb), // %1
+ "+r"(width) // %2
+ : "r"(dither4) // %3
+ : "cc", "memory", "v1", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width) {
+ asm volatile(
+ "movi v23.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
+}
+
+#if LIBYUV_USE_ST2
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "mov v1.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "tbl v0.16b, {v0.16b}, v4.16b \n"
+ "tbl v2.16b, {v2.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v1.16b, v0.16b \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#else
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "zip1 v2.16b, v0.16b, v0.16b \n"
+ "zip2 v3.16b, v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v4.16b, v1.16b, v1.16b \n"
+ "zip2 v5.16b, v1.16b, v1.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+static const uvec8 kShuffleARGBToAB64[2] = {
+ {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
+ {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "ldp q6, q7, [%3] \n" // 2 shufflers
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
+ "tbl v3.16b, {v0.16b}, v7.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v4.16b, {v1.16b}, v6.16b \n"
+ "tbl v5.16b, {v1.16b}, v7.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToAB64[0]) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
+static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31};
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAR64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
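+// The kShuffleAR64ToARGB table above selects byte 1 of every little-endian
+// 16-bit channel, so the conversion is simply v >> 8 per channel. Scalar
+// sketch (illustrative, hypothetical helper):
+static inline uint8_t AR64ChannelTo8_Sketch(uint16_t v) {
+  return (uint8_t)(v >> 8);  // keep the most significant byte
+}
+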
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15,
+ 21, 19, 17, 23, 29, 27, 25, 31};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+struct RgbUVConstants {
+ uint8_t kRGBToU[4];
+ uint8_t kRGBToV[4];
+};
+
+// 8x1 pixels.
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width,
+ const struct RgbUVConstants* rgbuvconstants) {
+ asm volatile(
+ "ldr d0, [%4] \n" // load rgbuvconstants
+ "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
+ "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
+ "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
+ "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
+ "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
+ "movi v29.16b, #0x80 \n" // 128.5
+
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n"
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+
+ "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
+ "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(rgbuvconstants) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+ "v27", "v28", "v29");
+}
+
+// RGB to BT.601 coefficients
+// UB 0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR 0.875 coefficient = 112 (ignored)
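+// Each magnitude is scaled by 128, e.g. 112 = 0.875 * 128 and 94 = 0.7344 * 128.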
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+ {18, 94, 112, 0}};
+
+// RGB to JPEG coefficients
+// UB coeff 0.500 = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500 = 127 (ignored)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+ {20, 107, 127, 0}};
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24JPegUVConstants);
+}
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// clang-format off
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */
+// clang-format on
+
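+// Scalar sketch of one U/V pair from RGBTOUV (illustrative, hypothetical
+// helper). In the functions below, b16/g16/r16 are 2x2 sums halved once by
+// urshr #1, i.e. 2x the average channel value; the v20-v25 constants are
+// the full coefficients divided by 2, and addhn adds 0x8080 then keeps the
+// high byte (>> 8).
+static inline void RgbToUV_Sketch(uint16_t b16, uint16_t g16, uint16_t r16,
+                                  uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)((b16 * 56 - g16 * 37 - r16 * 19 + 0x8080) >> 8);
+  *v = (uint8_t)((r16 * 56 - g16 * 47 - b16 * 9 + 0x8080) >> 8);
+}
+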
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// TODO(fbarchard): Subsample match Intel code.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width) {
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_uj), // %2
+ "+r"(dst_vj), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile(
+ RGBTOUV_SETUP_REG // sets v20-v25
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28"
+
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+ "v27");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+};
+
+// ARGB expects the first 3 values to contain RGB; the 4th value is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
+}
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
+
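+// Scalar sketch of the Y computation shared by the *ToYMatrixRow functions
+// (illustrative, hypothetical helper; channel order shown for ARGB-style
+// constants with the B coefficient first). addhn adds kAddY and keeps the
+// high byte, so the +0.5 (or +16.5) bias and the >> 8 happen together.
+static inline uint8_t RgbToY_Sketch(uint8_t b, uint8_t g, uint8_t r,
+                                    const struct RgbConstants* c) {
+  return (uint8_t)((b * c->kRGBToY[0] + g * c->kRGBToY[1] +
+                    r * c->kRGBToY[2] + c->kAddY) >> 8);
+}
+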
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first value to be A (ignored), followed by 3 values
+// containing RGB. Same code as ARGB, except for the ld4 register order.
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v5.16b, v0.b[0] \n"
+ "dup v6.16b, v0.b[1] \n"
+ "dup v7.16b, v0.b[2] \n"
+ "dup v16.8h, v0.h[2] \n"
+ "1: \n"
+ "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v5.8b \n" // B
+ "umull2 v1.8h, v2.16b, v5.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v6.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v6.16b \n"
+ "umlal v0.8h, v4.8b, v7.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v7.16b \n"
+ "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v16.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
+}
+
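+// Scalar sketch of the general blend path above (illustrative, hypothetical
+// helper): with f = source_y_fraction, each output byte is
+// (s0 * (256 - f) + s1 * f + 128) >> 8; f == 0 copies the row unchanged and
+// f == 128 reduces to a rounding average (urhadd).
+static inline uint8_t BlendPixel_Sketch(uint8_t s0, uint8_t s1, int f) {
+  return (uint8_t)((s0 * (256 - f) + s1 * f + 128) >> 8);
+}
+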
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n"
+ "dup v4.8h, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "umull v2.4s, v0.4h, v4.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n"
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n"
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+// Bilinear filter 8x2 -> 8x1
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
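+  // Worked example (illustrative): scale = 4096 for 12-bit sources gives
+  // __builtin_clz(4096) = 19, so shift = 15 - 19 = -4; ushl by a negative
+  // amount shifts right, moving 12-bit values into 8 bits (4095 >> 4 = 255).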
+
+ asm volatile(
+ "dup v6.8h, %w6 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n"
+ "dup v4.8h, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "umull v2.4s, v0.4h, v4.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n"
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n"
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "ushl v0.8h, v0.8h, v6.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%0], #8 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v0.8h, v0.8h, v6.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%0], #8 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ldr q0, [%1], #16 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "str d0, [%0], #8 \n" // store 8 pixels
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction), // %5
+ "r"(shift) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
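+// e.g. with dr = 100 and sa = 128: dr * sa / 256 = 50, so the result is
+// sr + (100 - 50) = sr + 50, matching dr * (256 - 128) / 256 + sr. The
+// rewritten form needs only a multiply, shift and subtract per channel, and
+// uqsub/uqadd keep the intermediate values from wrapping.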
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
+
+ // Blend 1 pixel.
+ "1: \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18");
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v7.8h, #0x00ff \n" // 255 for rounding up
+
+ // Attenuate 8 pixels.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8
+ "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8
+ "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset
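+// sqdmulh returns (2 * a * b) >> 16, so scale is pre-halved (ushr #1 below)
+// to obtain the plain (dst * scale) >> 16. Illustrative values: with
+// scale = 65536 / 8 = 8192, interval_size = 8, interval_offset = 4, a value
+// of 200 maps to ((200 * 8192) >> 16) * 8 + 4 = 25 * 8 + 4 = 204.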
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
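+// The setup below expands the packed 32-bit value into four u16 lanes of
+// the form component * 0x0101 (zip1 with itself duplicates each byte), then
+// halves them so that sqrdmulh's doubling yields roughly x * component / 255
+// per channel.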
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
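+// The coefficients sum to 256, so the result stays in range: for white
+// input (b = g = r = 255), (256 * 255 + 128) >> 8 = 255.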
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
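+// The outputs are narrowed with uqshrn, which saturates: e.g. for white
+// input (255, 255, 255), g = (45 + 88 + 22) * 255 >> 7 = 308 clamps to 255,
+// while b = (35 + 68 + 17) * 255 >> 7 = 239 is left as is.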
+
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v22", "v23", "v24", "v25");
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
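+// Applied as column differences: for each of the three rows the code forms
+// row(x) - row(x + 2) with usubl, adds the middle row's difference twice,
+// and takes the absolute value, so the kernel's sign convention cancels out.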
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "prfm pldl1keep, [%2, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
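+// Same decomposition as SobelX with rows and columns swapped: each of the
+// three columns contributes src_y0(x) - src_y1(x) between the top and
+// bottom rows, the center column is added twice, then the absolute value.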
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Caveat: rounds float to half float, whereas the scaling version truncates.
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
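+// The scale operand below is pre-multiplied by 1.9259299444e-34f, which is
+// 2^-112, the difference between the float32 exponent bias (127) and the
+// float16 bias (15). After that multiply, bits [28:13] of the float32 value
+// are, in effect, the float16 encoding, so uqshrn #13 both extracts it and
+// saturates values that are out of range.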
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
+ float* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats
+ "subs %w2, %w2, #8 \n" // 8 floats per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtl v2.4s, v1.4h \n" // 8 floats
+ "fcvtl2 v3.4s, v1.8h \n"
+ "stp q2, q3, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP16 Half Floats to FP32 Floats
+// Read a column and write a row
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
+ int src_stride, // stride in elements
+ float* dst,
+ int width) {
+ asm volatile(
+ "cmp %w2, #8 \n" // Is there 8 rows?
+ "b.lo 2f \n"
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats
+ "ld1 {v0.h}[1], [%0], %3 \n"
+ "ld1 {v0.h}[2], [%0], %3 \n"
+ "ld1 {v0.h}[3], [%0], %3 \n"
+ "ld1 {v1.h}[0], [%0], %3 \n"
+ "ld1 {v1.h}[1], [%0], %3 \n"
+ "ld1 {v1.h}[2], [%0], %3 \n"
+ "ld1 {v1.h}[3], [%0], %3 \n"
+ "subs %w2, %w2, #8 \n" // 8 rows per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtl v2.4s, v0.4h \n" // 4 floats
+ "fcvtl v3.4s, v1.4h \n" // 4 more floats
+ "stp q2, q3, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ "cmp %w2, #1 \n" // Is there 1 value?
+ "b.lo 3f \n"
+ "2: \n"
+ "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats
+ "subs %w2, %w2, #1 \n" // 1 floats per loop
+ "fcvtl v2.4s, v1.4h \n" // 1 floats
+ "str s2, [%1], #4 \n" // store 1 floats
+ "b.gt 2b \n"
+ "3: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)(src_stride * 2)) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+ uint16_t* dst, // fp16
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q2, q3, [%0], #32 \n" // load 8 floats
+ "subs %w2, %w2, #8 \n" // 8 floats per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "str q1, [%1], #16 \n" // store 8 fp16 halffloats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fmax;
+ asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fmax) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fsum;
+ asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fsum) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
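+// The coefficients sum to 16; the full 2-D Gaussian is this column pass
+// followed by the matching row pass (GaussRow_NEON), a combined weight of
+// 256, and the single divide by 256 happens in the row pass (uqrshrn #8).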
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ : "r"(32LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
+#if LIBYUV_USE_ST3
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+static const uvec8 kYUV24Shuffle[3] = {
+ {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
+ {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
+ {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
+
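+// Each tbl below indexes the 2-register table {v0 = Y bytes 0-15, v1 = VU
+// bytes 16-31}: e.g. the first three indices {16, 17, 0} assemble pixel 0
+// as V0, U0, Y0, and {16, 17, 1} reuses the same chroma pair for pixel 1.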
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
+ "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
+ "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+#endif // LIBYUV_USE_ST3
+
+// Note: the ST2 8b version is faster than zip + ST1.
+
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
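+// The 2x2 chroma average below works as follows: uaddlp sums horizontally
+// adjacent bytes, uadalp accumulates the pair sums from the second row, and
+// uqrshrn #2 computes (sum + 2) >> 2, a rounded average of the four samples.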
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = depth - 16; // Negative for right shift.
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "mul v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mul v1.8h, v1.8h, v2.8h \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w3 \n"
+ "1: \n"
+ "ldp q2, q3, [%0], #32 \n"
+ "umull v0.4s, v2.4h, v4.4h \n"
+ "umull2 v1.4s, v2.8h, v4.8h \n"
+ "umull v2.4s, v3.4h, v4.4h \n"
+ "umull2 v3.4s, v3.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "shrn v0.4h, v0.4s, #16 \n"
+ "shrn2 v0.8h, v1.4s, #16 \n"
+ "shrn v1.4h, v2.4s, #16 \n"
+ "shrn2 v1.8h, v3.4s, #16 \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ asm volatile(
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn2 v0.16b, v1.8h \n"
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "str q0, [%1], #16 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
new file mode 100644
index 00000000..0bf2bef6
--- /dev/null
+++ b/source/row_rvv.cc
@@ -0,0 +1,1394 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * Contributed by Darren Hsieh <darren.hsieh@sifive.com>
+ * Contributed by Bruce Lai <bruce.lai@sifive.com>
+ */
+
+#include "libyuv/row.h"
+
+// This module is for clang RVV. GCC does not yet support segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+ defined(__clang__)
+#include <assert.h>
+#include <riscv_vector.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fill YUV -> RGB conversion constants into vectors
+// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+// register) is set to round-to-nearest-up mode(0).
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+ { \
+ asm volatile("csrwi vxrm, 0"); \
+ ub = yuvconst->kUVCoeff[0]; \
+ vr = yuvconst->kUVCoeff[1]; \
+ ug = yuvconst->kUVCoeff[2]; \
+ vg = yuvconst->kUVCoeff[3]; \
+ yg = yuvconst->kRGBCoeffBias[0]; \
+ bb = yuvconst->kRGBCoeffBias[1] + 32; \
+ bg = yuvconst->kRGBCoeffBias[2] - 32; \
+ br = yuvconst->kRGBCoeffBias[3] + 32; \
+ }
+
+// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
+#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
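+// In READYUV422 the widened chroma is multiplied by 0x0101, which copies
+// each byte into both halves of its u16 lane (e.g. 0x0080 * 0x0101 =
+// 0x8080); reinterpreted as bytes this duplicates every U and V value,
+// giving the 2x horizontal chroma upsampling 422 needs without a shuffle.
+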
+// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444
+#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+ { \
+ vuint8m2_t v_y; \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_u = __riscv_vle8_v_u8m2(src_u, vl); \
+ v_v = __riscv_vle8_v_u8m2(src_v, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+// Convert from YUV to fixed point RGB
+#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
+ v_b_16, v_r_16) \
+ { \
+ vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+ vuint32m8_t v_tmp5; \
+ v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \
+ v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \
+ v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \
+ v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \
+ v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \
+ v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \
+ v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \
+ v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \
+ v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \
+ v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \
+ v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \
+ v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \
+ }
+
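+// Luma handling in YUVTORGB: y is first multiplied by 0x0101 (i.e. y * 257)
+// to stretch 0..255 to the full 0..65535 range; the widening multiply by yg
+// and the narrowing shift by 16 then yield (y * 257 * yg) >> 16, the
+// fixed-point luma term shared by all three channels.
+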
+// Convert from fixed point RGB to 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
+ { \
+ v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \
+ v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \
+ v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \
+ }
+
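+// The RGB values leaving YUVTORGB carry 6 fractional bits, so RGBTORGB8
+// narrows them with vnclipu by 6, rounding per the vxrm mode set in
+// YUVTORGB_SETUP and saturating to the 0..255 range.
+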
+// Read [2*VLEN/8] Y from src_y; read [VLEN/8] U and [VLEN/8] V from src_uv
+#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+// Read [2*VLEN/8] Y from src_y; read [VLEN/8] U and [VLEN/8] V from src_vu
+#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+#ifdef HAS_ARGBTOAR64ROW_RVV
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+ size_t avl = (size_t)4 * width;
+ do {
+ vuint16m8_t v_ar64;
+ vuint8m4_t v_argb;
+ size_t vl = __riscv_vsetvl_e8m4(avl);
+ v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
+ v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
+ v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
+ __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
+ avl -= vl;
+ src_argb += vl;
+ dst_ar64 += vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_RVV
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+ size_t avl = (size_t)width;
+ do {
+ vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m1(avl);
+ __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
+ v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
+ v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
+ v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
+ v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
+ v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
+ v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
+ v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
+ __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl);
+ avl -= vl;
+ src_argb += 4 * vl;
+ dst_ab64 += 4 * vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_RVV
+void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+ size_t avl = (size_t)4 * width;
+ do {
+ vuint16m8_t v_ar64;
+ vuint8m4_t v_argb;
+ size_t vl = __riscv_vsetvl_e16m8(avl);
+ v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl);
+ v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl);
+ __riscv_vse8_v_u8m4(dst_argb, v_argb, vl);
+ avl -= vl;
+ src_ar64 += vl;
+ dst_argb += vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_AR64TOAB64ROW_RVV
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
+ uint16_t* dst_ab64,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e16m2(w);
+ vuint16m2_t v_b, v_g, v_r, v_a;
+ __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl);
+ __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl);
+ w -= vl;
+ src_ar64 += vl * 4;
+ dst_ab64 += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_AB64TOARGBROW_RVV
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+ size_t avl = (size_t)width;
+ do {
+ vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e16m2(avl);
+ __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl);
+ v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
+ v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
+ v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
+ v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
+ __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ avl -= vl;
+ src_ab64 += 4 * vl;
+ dst_argb += 4 * vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_RAWTOARGBROW_RVV
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RAWTORGBAROW_RVV
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_rgba += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RAWTORGB24ROW_RVV
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_RVV
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_raw += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_RVV
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOABGRROW_RVV
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_abgr += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOBGRAROW_RVV
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_bgra += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORGBAROW_RVV
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_rgba += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGBATOARGBROW_RVV
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_rgba += vl * 4;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGB24TOARGBROW_RVV
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_rgb24 += vl * 3;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444TOARGBROW_RVV
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444ALPHATOARGBROW_RVV
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_a += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444TORGB24ROW_RVV
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TOARGBROW_RVV
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422ALPHATOARGBROW_RVV
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_a += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TORGBAROW_RVV
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgba += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TORGB24ROW_RVV
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I400TOARGBROW_RVV
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ vuint16m4_t v_yb;
+ vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
+  // To match behavior on other platforms, vxrm (the fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+ asm volatile("csrwi vxrm, 0");
+ if (is_yb_positive) {
+ v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
+ } else {
+ v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
+ }
+ do {
+ vuint8m2_t v_y, v_out;
+ vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
+ vl = __riscv_vsetvl_e8m2(w);
+ v_y = __riscv_vle8_v_u8m2(src_y, vl);
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
+ v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y
+ v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
+ if (is_yb_positive) {
+ v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
+ } else {
+ v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
+ }
+ v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_J400TOARGBROW_RVV
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_y;
+ v_y = __riscv_vle8_v_u8m2(src_y, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_COPYROW_RVV
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vse8_v_u8m8(dst, v_data, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
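+// CopyRow_RVV is a vector memcpy for one row. It uses LMUL=8 (e8m8): a
+// plain copy keeps only one register group live, so the widest grouping
+// moves the most bytes per vsetvl stripe. Behaviorally, CopyRow_RVV(src,
+// dst, width) matches memcpy(dst, src, (size_t)width) for non-overlapping
+// rows.
+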
+#ifdef HAS_NV12TOARGBROW_RVV
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_uv += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV12TORGB24ROW_RVV
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_uv += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TOARGBROW_RVV
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_vu += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TORGB24ROW_RVV
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_vu += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+
+#ifdef HAS_INTERPOLATEROW_RVV
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ size_t dst_w = (size_t)dst_width;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+ // Blend 100 / 0 - Copy row unchanged.
+ if (y1_fraction == 0) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+  // To match behavior on other platforms, vxrm (the fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+ asm volatile("csrwi vxrm, 0");
+ // Blend 50 / 50.
+ if (y1_fraction == 128) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+ vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+ // Use round-to-nearest-up mode for averaging add
+ vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+ __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // General purpose row blend.
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(dst_w);
+ vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+ vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+ acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+ // Use round-to-nearest-up mode for vnclip
+ __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+}
+#endif
+
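+// Scalar reference for the general blend above (illustrative sketch only).
+// With vxrm set to round-to-nearest-up, vnclipu(acc, 8) adds 128 before the
+// shift, e.g. p0 = 0, p1 = 255, y1_fraction = 64:
+// (0 * 192 + 255 * 64 + 128) >> 8 = 64.
+#if 0
+static uint8_t InterpolatePixel_Sketch(uint8_t p0, uint8_t p1,
+                                       int y1_fraction) {
+  int y0_fraction = 256 - y1_fraction;
+  return (uint8_t)((p0 * y0_fraction + p1 * y1_fraction + 128) >> 8);
+}
+#endif
+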
+#ifdef HAS_SPLITRGBROW_RVV
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_rgb += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGERGBROW_RVV
+void MergeRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ dst_rgb += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITARGBROW_RVV
+void SplitARGBRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_a += vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEARGBROW_RVV
+void MergeARGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ src_a += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITXRGBROW_RVV
+void SplitXRGBRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEXRGBROW_RVV
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_r, v_g, v_b;
+ v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITUVROW_RVV
+void SplitUVRow_RVV(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_u, v_v;
+ __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+ __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+ __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+ w -= vl;
+ dst_u += vl;
+ dst_v += vl;
+ src_uv += 2 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEUVROW_RVV
+void MergeUVRow_RVV(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m4_t v_u, v_v;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ v_u = __riscv_vle8_v_u8m4(src_u, vl);
+ v_v = __riscv_vle8_v_u8m4(src_v, vl);
+ __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+ w -= vl;
+ src_u += vl;
+ src_v += vl;
+ dst_uv += 2 * vl;
+ } while (w > 0);
+}
+#endif
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
+
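+// Worked example for kRgb24I601Constants: white (B = G = R = 255) gives
+// (25 + 129 + 66) * 255 + 0x1080 = 56100 + 4224 = 60324, and
+// 60324 >> 8 = 235, the BT.601 studio-swing white level. Scalar shape of
+// the Y kernels below (illustrative sketch only):
+#if 0
+static uint8_t RGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r,
+                             const struct RgbConstants* c) {
+  return (uint8_t)((c->kRGBToY[0] * b + c->kRGBToY[1] * g +
+                    c->kRGBToY[2] * r + c->kAddY) >> 8);
+}
+#endif
+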
+// ARGB expects the first 3 values to contain RGB; the 4th (alpha) is ignored.
+#ifdef HAS_ARGBTOYMATRIXROW_RVV
+void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_argb += 4 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOYROW_RVV
+void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_ARGBTOYJROW_RVV
+void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_ABGRTOYROW_RVV
+void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+#ifdef HAS_ABGRTOYJROW_RVV
+void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+#endif
+
+// RGBA expects the first value to be A (ignored), followed by 3 RGB values.
+#ifdef HAS_RGBATOYMATRIXROW_RVV
+void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_rgba += 4 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGBATOYROW_RVV
+void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_RGBATOYJROW_RVV
+void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_BGRATOYROW_RVV
+void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+#ifdef HAS_RGBTOYMATRIXROW_RVV
+void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_rgb += 3 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGB24TOYJROW_RVV
+void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_RAWTOYJROW_RVV
+void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+#endif
+
+#ifdef HAS_RGB24TOYROW_RVV
+void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_RAWTOYROW_RVV
+void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
+// src_argb: RGB values have already been premultiplied by alpha.
+#ifdef HAS_ARGBBLENDROW_RVV
+void ARGBBlendRow_RVV(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvlmax_e8m2();
+ // clamp255((((256 - a) * b) >> 8) + f)
+ // = b * (256 - a) / 256 + f
+ // = b - (b * a / 256) + f
+ vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
+ do {
+ vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
+ vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
+ vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
+ vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
+ vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
+ src_argb, vl);
+ __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
+ src_argb1, vl);
+
+ v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
+ v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
+ v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);
+
+ v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
+ v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
+ v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);
+
+ v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
+ v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
+ v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);
+
+ w -= vl;
+ src_argb += 4 * vl;
+ src_argb1 += 4 * vl;
+ dst_argb += 4 * vl;
+ } while (w > 0);
+}
+#endif
+
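+// Scalar reference for the premultiplied blend above (illustrative sketch
+// only): vmulhu computes (b * a) >> 8 and vsaddu saturates the final add.
+#if 0
+static uint8_t BlendChannel_Sketch(uint8_t f, uint8_t a, uint8_t b) {
+  int v = (b - ((b * a) >> 8)) + f;  // == b * (256 - a) / 256 + f
+  return (uint8_t)(v > 255 ? 255 : v);
+}
+#endif
+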
+#ifdef HAS_BLENDPLANEROW_RVV
+void BlendPlaneRow_RVV(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint16m8_t v_dst_u16;
+ vuint8m4_t v_dst;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+ vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+ vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
+ vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);
+
+    // dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
+ v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
+ v_dst_u16 =
+ __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
+ v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
+ v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);
+
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src0 += vl;
+ src1 += vl;
+ alpha += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
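+// The +255 bias above makes the endpoints exact, e.g. alpha = 255,
+// src0 = 200: (255 * 200 + 0 + 255) >> 8 = 51255 >> 8 = 200; alpha = 0
+// returns src1 the same way.
+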
+// Attenuate: (f * a + 255) >> 8
+#ifdef HAS_ARGBATTENUATEROW_RVV
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ // f * a
+ v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
+ v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
+ v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
+ // f * a + 255
+ v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
+ v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
+ v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
+ // (f * a + 255) >> 8
+ v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
+ v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
+ v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
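+// Worked example for the attenuation above: a = 255 is the identity, since
+// (f * 255 + 255) >> 8 = (255 * (f + 1)) >> 8 = f for any f in [0, 255];
+// a = 0 drives the channel to (0 + 255) >> 8 = 0.
+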
+#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_a += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ const ptrdiff_t dst_stride = 4;
+ dst += 3;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
+ w -= vl;
+ src += vl;
+ dst += vl * dst_stride;
+ } while (w > 0);
+}
+#endif
+
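+// ARGBCopyYToAlphaRow_RVV above writes only the A bytes of each ARGB pixel
+// via a strided store (vsse8 with a 4-byte stride, dst pre-advanced by 3).
+// Equivalent scalar loop (illustrative sketch only):
+#if 0
+  for (int i = 0; i < width; ++i) {
+    dst[i * 4 + 3] = src[i];  // overwrite the alpha channel only
+  }
+#endif
+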
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+ // defined(__clang__)
diff --git a/files/source/row_win.cc b/source/row_win.cc
index 27e3da7b..5fb28521 100644
--- a/files/source/row_win.cc
+++ b/source/row_win.cc
@@ -10,11 +10,13 @@
#include "libyuv/row.h"
-// This module is for Visual C 32/64 bit and clangcl 32 bit
+// This module is for Visual C 32/64 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
-#if defined(_M_X64)
+#if defined(_M_ARM64EC)
+#include <intrin.h>
+#elif defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
@@ -27,12 +29,34 @@ extern "C" {
// 64 bit
#if defined(_M_X64)
+// Read 8 UV from 444
+#define READYUV444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -40,10 +64,10 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -52,24 +76,21 @@ extern "C" {
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+#define YUVTORGB(yuvconstants) \
+ xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+ xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+ xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+ xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+ xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+ xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+ xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
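
+// Note on the rewritten YUVTORGB above: UV is re-biased to signed once
+// (subtract 0x80), so each channel needs a single signed multiply-add
+// (_mm_maddubs_epi16) against its packed coefficients; G is subtracted
+// from the biased Y term because its U and V coefficients are negative.
+// Rough scalar shape of one channel (illustrative only, not the exact
+// constants):
+//   y16 = ((y * 0x0101) * yg >> 16) + ybias;  // 6-bit fixed point
+//   b = clamp((y16 + ub * (u - 128)) >> 6);   // r analogous
+//   g = clamp((y16 - ug * (u - 128) - vg * (v - 128)) >> 6);
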
// Store 8 ARGB values.
@@ -90,7 +111,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -110,7 +131,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
@@ -121,6 +142,44 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
}
#endif
+#if defined(HAS_I444TOARGBROW_SSSE3)
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
+ while (width > 0) {
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
+ while (width > 0) {
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -187,11 +246,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for the UV bias.
+static const ulvec8 kBiasUV128 = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -836,7 +895,7 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
__asm {
@@ -883,7 +942,7 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1367,7 +1426,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1380,7 +1439,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1439,7 +1498,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1452,7 +1511,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@@ -1513,7 +1572,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1526,7 +1585,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1581,7 +1640,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1594,9 +1653,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -1649,7 +1708,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
@@ -1659,7 +1718,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1707,7 +1766,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1720,7 +1779,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@@ -1779,7 +1838,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1792,7 +1851,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@@ -1851,7 +1910,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1864,7 +1923,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@@ -1926,137 +1985,153 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 16 UV from 444
#define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16]}
+
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
__asm vpermq ymm5, ymm5, 0xd8 \
__asm lea ebp, [ebp + 16]}
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+  __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 \
+ __asm vpaddsw ymm0, ymm0, ymm4 \
+ __asm vpsubsw ymm1, ymm4, ymm1 \
+ __asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64]}
// Store 16 RGBA values.
#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64]}
@@ -2183,6 +2258,48 @@ __declspec(naked) void I444ToARGBRow_AVX2(
}
#endif // HAS_I444TOARGBROW_AVX2
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void I444AlphaToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+ convertloop:
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2361,191 +2478,202 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 8 UV from 444.
#define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
__asm lea ebp, [ebp + 8]}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
- __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
+ __asm { \
+ __asm psubb xmm3, xmmword ptr kBiasUV128 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm pmaddubsw xmm0, xmm3 \
+ __asm pmaddubsw xmm1, xmm3 \
+ __asm pmaddubsw xmm2, xmm3 \
+ __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm paddw xmm4, xmm3 \
+ __asm paddsw xmm0, xmm4 \
+ __asm paddsw xmm2, xmm4 \
+ __asm psubsw xmm4, xmm1 \
+ __asm movdqa xmm1, xmm4 \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
#define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm0 \
__asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32]}
// Store 8 BGRA values.
#define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGBA values.
#define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGB24 values.
#define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24]}
// Store 8 RGB565 values.
#define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16]}
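The shift/mask sequence above is the SIMD form of the standard 888-to-565 pack; per pixel it is equivalent to:

static unsigned short PackRGB565(unsigned char b, unsigned char g, unsigned char r) {
  // 5 bits blue (low), 6 bits green, 5 bits red (high).
  return (unsigned short)((b >> 3) | ((unsigned short)(g >> 2) << 5) |
                          ((unsigned short)(r >> 3) << 11));
}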
// 8 pixels.
@@ -2586,6 +2714,46 @@ __declspec(naked) void I444ToARGBRow_SSSE3(
}
// 8 pixels.
+// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
+__declspec(naked) void I444AlphaToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA444
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
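A note on the [esp + 16 + 4] style operands in these naked cdecl functions: the 16 accounts for the four saved registers, the 4 skips the return address, so the N-th argument lives at [esp + 16 + 4*N].

/* Stack after push esi/edi/ebx/ebp (32-bit cdecl):
   [esp +  0]..[esp + 12]  saved registers
   [esp + 16]              return address
   [esp + 20]              y_buf  == [esp + 16 + 4]
   [esp + 24]              u_buf  == [esp + 16 + 8]
   ...
   [esp + 44]              width  == [esp + 16 + 28] */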
+
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
const uint8_t* y_buf,
@@ -2623,6 +2791,44 @@ __declspec(naked) void I422ToRGB24Row_SSSE3(
}
}
+// 8 pixels.
+// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked) void I444ToRGB24Row_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+    mov edx, [esp + 12 + 16]  // rgb24

+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV444
+ YUVTORGB(ebx)
+ STORERGB24
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) void I422ToRGB565Row_SSSE3(
@@ -2898,10 +3104,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
// I400ToARGBRow_SSE2 is disabled due to the new yuvconstants parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +3157,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3254,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3287,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
@@ -3254,17 +3463,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
- lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vextractf128 [edi], ymm2, 0 // bytes 0..15
- vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
- vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
-    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
- lea edi, [edi + 64]
- sub ecx, 32
+ vpmovzxbw ymm0, [eax]
+ vpmovzxbw ymm1, [eax + edx]
+ lea eax, [eax + 16]
+ vpsllw ymm1, ymm1, 8
+ vpor ymm2, ymm1, ymm0
+ vmovdqu [edi], ymm2
+ lea edi, [edi + 32]
+ sub ecx, 16
jg convertloop
pop edi
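The rewritten AVX2 loop gets the same interleaving with fewer shuffles: vpmovzxbw widens 16 U and 16 V bytes to words, vpsllw moves V into the high byte of each word, and vpor fuses them. A scalar sketch of the row's effect:

// Interleave one row of planar U and V into packed UV pairs.
static void MergeUVRowSketch(const unsigned char* src_u, const unsigned char* src_v,
                             unsigned char* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  // low byte of each word (vpmovzxbw)
    dst_uv[2 * x + 1] = src_v[x];  // high byte (vpsllw 8, vpor)
  }
}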
@@ -4172,13 +4378,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4267,7 +4473,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
@@ -4312,7 +4518,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4406,7 +4612,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4762,20 +4968,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
@@ -4783,8 +4989,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
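The punpcklbw-with-self plus pmulhuw pair in this loop is the usual approximate divide-by-255: duplicating a byte into both halves of a 16-bit lane multiplies it by 257, so the high-word multiply returns (a * 257 * b) >> 16, which is within one of a * b / 255. A scalar sketch:

// Approximate (a * b) / 255 the way the SSE2 loop does.
static unsigned char MulDiv255Sketch(unsigned char a, unsigned char b) {
  unsigned int a257 = (unsigned int)a * 257u;  // punpcklbw xmm0, xmm0
  return (unsigned char)((a257 * b) >> 16);    // pmulhuw
}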
@@ -4802,13 +5008,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4817,11 +5023,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop49
convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4832,11 +5038,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop19
convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
+    movd xmm0, [eax]  // read 1 pixel from src_argb
lea eax, [eax + 4]
movd xmm1, [esi]  // read 1 pixel from src_argb1
lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -4851,23 +5057,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4881,20 +5087,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4902,8 +5108,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -4919,19 +5125,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4949,21 +5155,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5450,7 +5656,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
// 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
diff --git a/files/source/scale.cc b/source/scale.cc
index ab085496..b7a602ba 100644
--- a/files/source/scale.cc
+++ b/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -28,6 +29,7 @@ static __inline int Abs(int v) {
}
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
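CENTERSTART picks the starting 16.16 source coordinate so the 2-row (or 2-column) filter is centered rather than anchored at row 0. A worked example, scaling height 480 down to 270:

// dy = 65536 * 480 / 270 = 116508 (a step of about 1.778 source rows).
// CENTERSTART(dy, -32768) = (116508 >> 1) - 32768 = 25486,
// so the first destination row samples source y ~= 25486 / 65536 ~= 0.39,
// centering the filter taps instead of starting flush at the top edge.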
// Scale plane, 1/2
// This is an optimized version for scaling down a plane to 1/2 of
@@ -49,7 +51,7 @@ static void ScalePlaneDown2(int src_width,
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -118,21 +120,29 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown2 =
filtering == kFilterNone
- ? ScaleRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
- : ScaleRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ ? ScaleRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX
+ : ScaleRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX
: (filtering == kFilterLinear
- ? ScaleRowDown2Linear_MMI
- : ScaleRowDown2Box_MMI);
+ ? ScaleRowDown2Linear_LSX
+ : ScaleRowDown2Box_LSX);
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown2 = filtering == kFilterNone
+ ? ScaleRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV
+ : ScaleRowDown2Box_RVV);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -161,7 +171,7 @@ static void ScalePlaneDown2_16(int src_width,
? ScaleRowDown2_16_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
: ScaleRowDown2Box_16_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -184,14 +194,6 @@ static void ScalePlaneDown2_16(int src_width,
: ScaleRowDown2Box_16_SSE2);
}
#endif
-#if defined(HAS_SCALEROWDOWN2_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
- : (filtering == kFilterLinear
- ? ScaleRowDown2Linear_16_MMI
- : ScaleRowDown2Box_16_MMI);
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -204,6 +206,51 @@ static void ScalePlaneDown2_16(int src_width,
}
}
+void ScalePlaneDown2_16To8(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width, int scale) =
+ (src_width & 1)
+ ? (filtering == kFilterNone
+ ? ScaleRowDown2_16To8_Odd_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C
+ : ScaleRowDown2Box_16To8_Odd_C))
+ : (filtering == kFilterNone
+ ? ScaleRowDown2_16To8_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C
+ : ScaleRowDown2Box_16To8_C));
+ int row_stride = src_stride * 2;
+ (void)dst_height;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < src_height / 2; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+ if (src_height & 1) {
+ if (!filtering) {
+ src_ptr -= src_stride; // Point to last row.
+ }
+ ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale);
+ }
+}
+
// Scale plane, 1/4
// This is an optimized version for scaling down a plane to 1/4 of
// its original size.
@@ -221,7 +268,7 @@ static void ScalePlaneDown4(int src_width,
void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -264,15 +311,20 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN4_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown4 =
- filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX;
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_RVV : ScaleRowDown4_RVV;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -297,7 +349,7 @@ static void ScalePlaneDown4_16(int src_width,
void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -316,11 +368,6 @@ static void ScalePlaneDown4_16(int src_width,
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
-#if defined(HAS_SCALEROWDOWN4_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -398,6 +445,26 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX;
+ }
+ if (dst_width % 48 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@@ -418,6 +485,17 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_RVV;
+ ScaleRowDown34_1 = ScaleRowDown34_RVV;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV;
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -613,6 +691,37 @@ static void ScalePlaneDown38(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN38_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_RVV;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -746,9 +855,11 @@ static void ScaleAddCols2_C(int dst_width,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ =
- SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
- 16;
+ int scaletbl_index = boxwidth - minboxwidth;
+ assert((scaletbl_index == 0) || (scaletbl_index == 1));
+ *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) *
+ scaletbl[scaletbl_index] >>
+ 16);
}
}
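Because x advances by a fixed 16.16 step, (x >> 16) - ix can only take two values, minboxwidth and minboxwidth + 1, which is exactly what the new assert encodes. A worked example for dst_width 3 over src_width 8:

// dx = 65536 * 8 / 3 = 174762, minboxwidth = dx >> 16 = 2.
// Successive boxes cover source columns [0,2), [2,5), [5,7): widths 2, 3, 2.
// scaletbl[] therefore needs only the two reciprocals 65536/2 and 65536/3.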
@@ -768,9 +879,10 @@ static void ScaleAddCols2_16_C(int dst_width,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >>
- 16;
+ int scaletbl_index = boxwidth - minboxwidth;
+ assert((scaletbl_index == 0) || (scaletbl_index == 1));
+ *dst_ptr++ =
+ SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16;
}
}
@@ -785,7 +897,7 @@ static void ScaleAddCols0_C(int dst_width,
(void)dx;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = src_ptr[i] * scaleval >> 16;
+ *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16);
}
}
@@ -800,7 +912,7 @@ static void ScaleAddCols1_C(int dst_width,
int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16);
x += boxwidth;
}
}
@@ -827,14 +939,14 @@ static void ScaleAddCols1_16_C(int dst_width,
// one pixel of destination using fixed point (16.16) to step
// through source, sampling a box of pixels with simple
// averaging.
-static void ScalePlaneBox(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr) {
+static int ScalePlaneBox(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -848,6 +960,8 @@ static void ScalePlaneBox(int src_width,
{
// Allocate a row buffer of uint16_t.
align_buffer_64(row16, src_width * 2);
+ if (!row16)
+ return 1;
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16_t* src_ptr, uint8_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C
@@ -886,19 +1000,24 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
-#if defined(HAS_SCALEADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleAddRow = ScaleAddRow_Any_MMI;
- if (IS_ALIGNED(src_width, 8)) {
- ScaleAddRow = ScaleAddRow_MMI;
+#if defined(HAS_SCALEADDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleAddRow = ScaleAddRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_LSX;
}
}
#endif
+#if defined(HAS_SCALEADDROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleAddRow = ScaleAddRow_RVV;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint8_t* src = src_ptr + iy * src_stride;
+ const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -914,16 +1033,17 @@ static void ScalePlaneBox(int src_width,
}
free_aligned_buffer_64(row16);
}
+ return 0;
}
-static void ScalePlaneBox_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr) {
+static int ScalePlaneBox_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -937,6 +1057,8 @@ static void ScalePlaneBox_16(int src_width,
{
// Allocate a row buffer of uint32_t.
align_buffer_64(row32, src_width * 4);
+ if (!row32)
+ return 1;
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32_t* src_ptr, uint16_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
@@ -949,15 +1071,10 @@ static void ScalePlaneBox_16(int src_width,
}
#endif
-#if defined(HAS_SCALEADDROW_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
- ScaleAddRow = ScaleAddRow_16_MMI;
- }
-#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint16_t* src = src_ptr + iy * src_stride;
+ const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -973,18 +1090,19 @@ static void ScalePlaneBox_16(int src_width,
}
free_aligned_buffer_64(row32);
}
+ return 0;
}
// Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -993,13 +1111,15 @@ void ScalePlaneBilinearDown(int src_width,
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row buffer.
align_buffer_64(row, src_width);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1038,14 +1158,19 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1068,13 +1193,21 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1089,17 +1222,18 @@ void ScalePlaneBilinearDown(int src_width,
}
}
free_aligned_buffer_64(row);
+ return 0;
}
-void ScalePlaneBilinearDown_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearDown_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -1108,13 +1242,15 @@ void ScalePlaneBilinearDown_16(int src_width,
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row buffer.
align_buffer_64(row, src_width * 2);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
- void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1123,7 +1259,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1131,7 +1267,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1139,7 +1275,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1147,7 +1283,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1165,7 +1301,7 @@ void ScalePlaneBilinearDown_16(int src_width,
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1180,18 +1316,19 @@ void ScalePlaneBilinearDown_16(int src_width,
}
}
free_aligned_buffer_64(row);
+ return 0;
}
// Scale plane up with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1199,10 +1336,10 @@ void ScalePlaneBilinearUp(int src_width,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1233,6 +1370,11 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (filtering && src_width >= 32768) {
ScaleFilterCols = ScaleFilterCols64_C;
@@ -1258,6 +1400,14 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
@@ -1265,11 +1415,6 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -1277,14 +1422,16 @@ void ScalePlaneBilinearUp(int src_width,
}
{
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
- const int kRowSize = (dst_width + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
ScaleFilterCols(rowptr, src, dst_width, x, dx);
@@ -1292,7 +1439,9 @@ void ScalePlaneBilinearUp(int src_width,
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1300,14 +1449,16 @@ void ScalePlaneBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1321,17 +1472,355 @@ void ScalePlaneBilinearUp(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
-void ScalePlaneBilinearUp_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- enum FilterMode filtering) {
+// Scale plane, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// This is used to scale U and V planes of I422 to I444.
+static void ScalePlaneUp2_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+ ScaleRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
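A scalar sketch of the 2x horizontal linear kernel these Any/SIMD variants implement, assuming the common quarter-pixel 3:1 taps (edge handling for the final source pixel omitted):

static void ScaleRowUp2LinearSketch(const unsigned char* src, unsigned char* dst,
                                    int dst_width) {
  int x;
  int src_width = dst_width / 2;
  for (x = 0; x < src_width; ++x) {
    // Two outputs per source pixel, each weighted 3:1 with its neighbor.
    dst[2 * x + 0] = (unsigned char)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (unsigned char)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}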
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// This is used to scale U and V planes of I420 to I444.
+static void ScalePlaneUp2_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_RVV;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
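The 2x bilinear path writes two destination rows per call, one above and one below the source row pair; each output pixel is a blend of the four nearest source pixels. Assuming the standard half-pixel-centered 9:3:3:1 weights, one tap looks like:

// s00 is the nearest source pixel to this output, s11 the farthest.
static unsigned char Blend9331(int s00, int s01, int s10, int s11) {
  return (unsigned char)((9 * s00 + 3 * s01 + 3 * s10 + s11 + 8) >> 4);
}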
+
+// Scale at most 14 bit plane, horizontally up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I210 to I410 and I212 to I412.
+static void ScalePlaneUp2_12_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale at most 12 bit plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I010 to I410 and I012 to I412.
+static void ScalePlaneUp2_12_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+static void ScalePlaneUp2_16_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+static void ScalePlaneUp2_16_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+static int ScalePlaneBilinearUp_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1339,10 +1828,10 @@ void ScalePlaneBilinearUp_16(int src_width,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
- void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1351,7 +1840,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1359,7 +1848,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1367,7 +1856,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1375,7 +1864,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1397,34 +1886,31 @@ void ScalePlaneBilinearUp_16(int src_width,
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
-
if (y > max_y) {
y = max_y;
}
{
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
- const int kRowSize = (dst_width + 31) & ~31;
- align_buffer_64(row, kRowSize * 4);
-
- uint16_t* rowptr = (uint16_t*)row;
- int rowstride = kRowSize;
+ const int row_size = (dst_width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ int rowstride = row_size;
int lasty = yi;
+ uint16_t* rowptr = (uint16_t*)row;
+ if (!row)
+ return 1;
ScaleFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1432,14 +1918,16 @@ void ScalePlaneBilinearUp_16(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1453,6 +1941,7 @@ void ScalePlaneBilinearUp_16(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
// Scale Plane to/from any dimensions, without interpolation.
@@ -1469,7 +1958,7 @@ static void ScalePlaneSimple(int src_width,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+ void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
int x, int dx) = ScaleCols_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1487,15 +1976,11 @@ static void ScalePlaneSimple(int src_width,
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
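ScalePlaneSimple is nearest-neighbor sampling in 16.16 fixed point; the (int64_t) cast added above keeps (y >> 16) * src_stride from overflowing 32-bit int on tall strides. The column step it dispatches to is essentially:

// Nearest-neighbor column stepping, 16.16 fixed point (cf. ScaleCols_C).
static void ScaleColsSketch(unsigned char* dst, const unsigned char* src,
                            int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}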
@@ -1510,7 +1995,7 @@ static void ScalePlaneSimple_16(int src_width,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+ void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
int x, int dx) = ScaleCols_16_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1528,15 +2013,11 @@ static void ScalePlaneSimple_16(int src_width,
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
@@ -1544,17 +2025,16 @@ static void ScalePlaneSimple_16(int src_width,
// Scale a plane.
// This function dispatches to a specialized scaler based on scale factor.
-
LIBYUV_API
-void ScalePlane(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering) {
+int ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
@@ -1562,23 +2042,31 @@ void ScalePlane(const uint8_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
+ return 0;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
- return;
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
+ return 0;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
@@ -1586,58 +2074,67 @@ void ScalePlane(const uint8_t* src,
// optimized, 3/4
ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
// 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
- return;
+ return ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ }
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
}
if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
if (filtering) {
- ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst);
+ return 0;
}
LIBYUV_API
-void ScalePlane_16(const uint16_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint16_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering) {
+int ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
@@ -1645,23 +2142,34 @@ void ScalePlane_16(const uint16_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
+ return 0;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+      // When scaling up, ensure the last destination row uses the last
+      // source row. Skipping FixedDiv1 when dst_height is 1 avoids a
+      // divide by zero; dy stays 0 and no vertical scaling is done later.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
- return;
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
+ return 0;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
@@ -1669,46 +2177,93 @@ void ScalePlane_16(const uint16_t* src,
// optimized, 3/4
ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
// 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
- return;
+ return ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ }
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
}
if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
if (filtering) {
- ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearDown_16(src_width, src_height, dst_width,
+ dst_height, src_stride, dst_stride, src,
+ dst, filtering);
}
ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst);
+ return 0;
+}
+
+LIBYUV_API
+int ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * (int64_t)src_stride;
+ src_stride = -src_stride;
+ }
+
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+
+ return ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
+ dst_width, dst_height, filtering);
}
// Scale an I420 image.
@@ -1736,19 +2291,27 @@ int I420Scale(const uint8_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
- dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
- dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
- return 0;
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
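
Aside: with ScalePlane now returning int, I420Scale reports the first plane
that fails. A minimal caller sketch, assuming a contiguous I420 layout with
even dimensions; the buffer math is illustrative, not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "libyuv/scale.h"

int main(void) {
  int sw = 1280, sh = 720, dw = 640, dh = 360;
  uint8_t* src = (uint8_t*)calloc(1, (size_t)sw * sh * 3 / 2);  // I420
  uint8_t* dst = (uint8_t*)calloc(1, (size_t)dw * dh * 3 / 2);
  int r;
  if (!src || !dst)
    return 1;
  r = I420Scale(src, sw, src + sw * sh, (sw + 1) / 2,         // Y, U planes
                src + sw * sh * 5 / 4, (sw + 1) / 2, sw, sh,  // V plane
                dst, dw, dst + dw * dh, (dw + 1) / 2,
                dst + dw * dh * 5 / 4, (dw + 1) / 2, dw, dh,
                kFilterBilinear);
  if (r != 0)
    fprintf(stderr, "I420Scale failed: %d\n", r);
  free(src);
  free(dst);
  return r;
}
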
LIBYUV_API
@@ -1773,19 +2336,72 @@ int I420Scale_16(const uint16_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
- dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
- ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
- dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
- return 0;
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
// Scale an I444 image.
@@ -1809,19 +2425,27 @@ int I444Scale(const uint8_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
- dst_width, dst_height, filtering);
- ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
- dst_width, dst_height, filtering);
- return 0;
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
}
LIBYUV_API
@@ -1842,19 +2466,239 @@ int I444Scale_16(const uint16_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
- dst_width, dst_height, filtering);
- return 0;
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
+}
+
+// Scale an I422 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
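
Aside: the half-plane sizes above all come from SUBSAMPLE(v, 1, 1). A sketch
of the assumed macro shape, which rounds the halved dimension up and keeps
the sign of negative (inverted) heights:

#include <stdio.h>

// Assumed shape of libyuv's SUBSAMPLE(v, a, s).
static int subsample(int v, int a, int s) {
  return v < 0 ? -((-v + a) >> s) : (v + a) >> s;
}

int main(void) {
  printf("%d\n", subsample(1279, 1, 1));  // 640: odd luma width rounds up
  printf("%d\n", subsample(-720, 1, 1));  // -360: inversion flag preserved
  return 0;
}
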
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int r;
+
+ if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
// Deprecated api
diff --git a/source/scale_any.cc b/source/scale_any.cc
new file mode 100644
index 00000000..f6576874
--- /dev/null
+++ b/source/scale_any.cc
@@ -0,0 +1,1078 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h> // For memset/memcpy
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fixed scale down.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
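
Aside: SDANY's split is plain remainder math. A toy end-to-end run with
pretend kernels standing in for the SIMD and C row functions; all names here
are illustrative.

#include <stdio.h>

// The "SIMD" kernel handles a multiple of (MASK + 1) output pixels and the
// C kernel mops up the remainder at the matching source offset.
static void simd_down2(const unsigned char* s, unsigned char* d, int n) {
  int i;
  for (i = 0; i < n; ++i) d[i] = s[2 * i + 1];  // pretend-vectorized
}
static void c_down2(const unsigned char* s, unsigned char* d, int n) {
  int i;
  for (i = 0; i < n; ++i) d[i] = s[2 * i + 1];
}

int main(void) {
  enum { MASK = 15, FACTOR = 2, BPP = 1 };
  unsigned char src[200], dst[100];
  int dst_width = 100, i;
  for (i = 0; i < 200; ++i) src[i] = (unsigned char)i;
  {
    int r = (int)((unsigned)dst_width % (MASK + 1));  // 100 % 16 = 4
    int n = dst_width - r;                            // 96 pixels via SIMD
    if (n > 0) simd_down2(src, dst, n);
    c_down2(src + n * FACTOR * BPP, dst + n * BPP, r);  // last 4 via C
    printf("n=%d r=%d dst[99]=%d\n", n, r, dst[99]);    // 96 4 199
  }
  return 0;
}
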
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
+ int n = (dst_width - 1) - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r + 1); \
+ }
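
Aside: worked numbers for SDODD with MASK = 15 (illustrative): for
dst_width = 101 (odd source width 201), r = (101 - 1) % 16 = 4 and n = 96,
so the SIMD kernel covers dst[0..95] and the odd-width C kernel receives
r + 1 = 5 pixels, producing the copied final pixel itself.
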
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+ ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+ ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+ ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2_NEON
+SDANY(ScaleUVRowDown2_Any_NEON,
+ ScaleUVRowDown2_NEON,
+ ScaleUVRowDown2_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON
+SDANY(ScaleUVRowDown2Linear_Any_NEON,
+ ScaleUVRowDown2Linear_NEON,
+ ScaleUVRowDown2Linear_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
+
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+ ScaleRowDown2Linear_MSA,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+ ScaleRowDown2Box_MSA,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_LSX
+SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_LSX,
+ ScaleRowDown2Linear_LSX,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_LSX,
+ ScaleRowDown2Box_LSX,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+ ScaleRowDown4Box_SSSE3,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+ ScaleRowDown4Box_AVX2,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+ ScaleRowDown4Box_NEON,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+ ScaleRowDown4Box_MSA,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_LSX
+SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_LSX,
+ ScaleRowDown4Box_LSX,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3,
+ ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+ ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+ ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON,
+ ScaleRowDown34_NEON,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+ ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+ ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+ ScaleRowDown34_MSA,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+ ScaleRowDown34_0_Box_MSA,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+ ScaleRowDown34_1_Box_MSA,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
+#ifdef HAS_SCALEROWDOWN34_LSX
+SDANY(ScaleRowDown34_Any_LSX,
+ ScaleRowDown34_LSX,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_LSX,
+ ScaleRowDown34_0_Box_LSX,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_LSX,
+ ScaleRowDown34_1_Box_LSX,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3,
+ ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+ ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+ ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON,
+ ScaleRowDown38_NEON,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+ ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+ ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+ ScaleRowDown38_MSA,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+ ScaleRowDown38_3_Box_MSA,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+ ScaleRowDown38_2_Box_MSA,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_LSX
+SDANY(ScaleRowDown38_Any_LSX,
+ ScaleRowDown38_LSX,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_LSX,
+ ScaleRowDown38_3_Box_LSX,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_LSX,
+ ScaleRowDown38_2_Box_LSX,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+ ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+ ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+ ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON,
+ ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+ ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+ ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+ ScaleARGBRowDown2_MSA,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+ ScaleARGBRowDown2Linear_MSA,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+ ScaleARGBRowDown2Box_MSA,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_LSX
+SDANY(ScaleARGBRowDown2_Any_LSX,
+ ScaleARGBRowDown2_LSX,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_LSX,
+ ScaleARGBRowDown2Linear_LSX,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_LSX,
+ ScaleARGBRowDown2Box_LSX,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8_t* dst_ptr, int dst_width) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
+ }
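
Aside: every MASK + 1 here is a power of two, so SDANY's MOD reduces to bit
masking: dst_width = 13 with MASK = 3 gives r = 13 & 3 = 1 and
n = 13 & ~3 = 12, and the C tail starts at src_ptr + (n * src_stepx) * BPP
because the column step is a runtime value rather than a fixed FACTOR.
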
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+ ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+ ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+ ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+ ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+ ScaleARGBRowDownEven_MSA,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+ ScaleARGBRowDownEvenBox_MSA,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX
+SDAANY(ScaleARGBRowDownEven_Any_LSX,
+ ScaleARGBRowDownEven_LSX,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_LSX,
+ ScaleARGBRowDownEvenBox_LSX,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
+
+#ifdef SASIMDONLY
+// This variant also works, using memcpy plus SIMD for the remainder instead
+// of a C kernel, but it is slower on ARM.
+
+// Add rows box filter scale down, using the macro pattern from row_any.cc.
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint16_t dst_temp[32]); \
+ SIMD_ALIGNED(uint8_t src_temp[32]); \
+ memset(dst_temp, 0, 32 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
+#endif
+#undef SAROW
+
+#else
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#endif // SASIMDONLY
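
Aside: both the SAROW and SAANY wrappers dispatch the same primitive. A
reference sketch of the assumed ScaleAddRow contract: accumulate one uint8_t
source row into uint16_t sums, which the box filter averages afterwards.

#include <stdint.h>
#include <stdio.h>

static void scale_add_row(const uint8_t* src, uint16_t* dst, int w) {
  int i;
  for (i = 0; i < w; ++i) dst[i] = (uint16_t)(dst[i] + src[i]);
}

int main(void) {
  uint8_t row0[4] = {10, 20, 30, 40}, row1[4] = {30, 40, 50, 60};
  uint16_t sums[4] = {0, 0, 0, 0};
  scale_add_row(row0, sums, 4);  // sums of a 2-row box window
  scale_add_row(row1, sums, 4);
  printf("%d %d %d %d\n", sums[0], sums[1], sums[2], sums[3]);  // 40 60 80 100
  return 0;
}
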
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
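
Aside: the C tail in CANY resumes at x + n * dx, keeping sample positions
continuous across the split: with dx = 98304 (1.5 px/step in 16.16) and
n = 8, the tail starts at 8 * 98304 = 786432, i.e. exactly 12.0 source
pixels in, where pixel 8 would have sampled anyway.
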
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_LSX
+CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_LSX
+CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_LSX
+CANY(ScaleARGBFilterCols_Any_LSX,
+ ScaleARGBFilterCols_LSX,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
+
+// Scale up horizontally 2 times using linear filter.
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 1, n); \
+ } \
+ C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
+ } \
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \
+ }
+
+// Even the C versions need to be wrapped, because boundary pixels have to
+// be handled differently.
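
Aside: a reference row function showing the boundary rule these wrappers
enforce, with interior weights following ScaleRowUp2_Linear_C's assumed
rounded 3:1 blend; treat the kernel body as illustrative.

#include <stdint.h>
#include <stdio.h>

// 2x horizontal linear upsample: endpoints copied, interior blended 3:1.
static void up2_linear(const uint8_t* src, uint8_t* dst, int dst_width) {
  int i;
  dst[0] = src[0];
  for (i = 1; i < dst_width - 1; ++i) {
    int s = i / 2;                    // nearest source pixel
    int o = (i & 1) ? s + 1 : s - 1;  // second-nearest source pixel
    dst[i] = (uint8_t)((3 * src[s] + src[o] + 2) >> 2);
  }
  dst[dst_width - 1] = src[(dst_width - 1) / 2];
}

int main(void) {
  uint8_t src[4] = {0, 40, 80, 120}, dst[8];
  int i;
  up2_linear(src, dst, 8);
  for (i = 0; i < 8; ++i) printf("%d ", dst[i]);  // 0 10 30 50 70 90 110 120
  printf("\n");
  return 0;
}
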
+
+SUH2LANY(ScaleRowUp2_Linear_Any_C,
+ ScaleRowUp2_Linear_C,
+ ScaleRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
+ ScaleRowUp2_Linear_16_C,
+ ScaleRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
+ ScaleRowUp2_Linear_SSE2,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
+ ScaleRowUp2_Linear_SSSE3,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3,
+ ScaleRowUp2_Linear_12_SSSE3,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
+ ScaleRowUp2_Linear_16_SSE2,
+ ScaleRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
+ ScaleRowUp2_Linear_AVX2,
+ ScaleRowUp2_Linear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2,
+ ScaleRowUp2_Linear_12_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 31,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
+ ScaleRowUp2_Linear_16_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+ ScaleRowUp2_Linear_NEON,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
+ ScaleRowUp2_Linear_12_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
+ ScaleRowUp2_Linear_16_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SUH2LANY
+
+// Scale up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 1, db - da, n); \
+ } \
+ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
+ } \
+ da[dst_width - 1] = \
+ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
+ db[dst_width - 1] = \
+ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
+ }
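
Aside: corner arithmetic for the rounded 3:1 vertical blends above, taking
sa[0] = 10 and sb[0] = 30: da[0] = (3*10 + 30 + 2) >> 2 = 15, biased toward
row a, and db[0] = (10 + 3*30 + 2) >> 2 = 25, biased toward row b.
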
+
+SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
+ ScaleRowUp2_Bilinear_C,
+ ScaleRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
+ ScaleRowUp2_Bilinear_16_C,
+ ScaleRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
+ ScaleRowUp2_Bilinear_SSE2,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3,
+ ScaleRowUp2_Bilinear_12_SSSE3,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
+ ScaleRowUp2_Bilinear_16_SSE2,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
+ ScaleRowUp2_Bilinear_SSSE3,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
+ ScaleRowUp2_Bilinear_AVX2,
+ ScaleRowUp2_Bilinear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2,
+ ScaleRowUp2_Bilinear_12_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
+ ScaleRowUp2_Bilinear_16_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
+ ScaleRowUp2_Bilinear_NEON,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON,
+ ScaleRowUp2_Bilinear_12_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
+ ScaleRowUp2_Bilinear_16_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SU2BLANY
+
+// Scale bi-planar plane up horizontally 2 times using linear filter.
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ dst_ptr[1] = src_ptr[1]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 2, n); \
+ } \
+ C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
+ } \
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
+ }
+
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
+ ScaleUVRowUp2_Linear_C,
+ ScaleUVRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
+ ScaleUVRowUp2_Linear_16_C,
+ ScaleUVRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
+ ScaleUVRowUp2_Linear_SSSE3,
+ ScaleUVRowUp2_Linear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
+ ScaleUVRowUp2_Linear_AVX2,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41,
+ ScaleUVRowUp2_Linear_16_SSE41,
+ ScaleUVRowUp2_Linear_16_C,
+ 3,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
+ ScaleUVRowUp2_Linear_16_AVX2,
+ ScaleUVRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
+ ScaleUVRowUp2_Linear_NEON,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
+ ScaleUVRowUp2_Linear_16_NEON,
+ ScaleUVRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SBUH2LANY
+
+// Scale bi-planar plane up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
+ db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 2, db - da, n); \
+ } \
+ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
+ } \
+ da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
+ sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
+ 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
+ sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
+ 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+ ScaleUVRowUp2_Bilinear_C,
+ ScaleUVRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+ ScaleUVRowUp2_Bilinear_SSSE3,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_AVX2,
+ ScaleUVRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
+ ScaleUVRowUp2_Bilinear_16_SSE41,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_16_AVX2,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+ ScaleUVRowUp2_Bilinear_NEON,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
+ ScaleUVRowUp2_Bilinear_16_NEON,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SBU2BLANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_argb.cc b/source/scale_argb.cc
index beef380a..18bdeb86 100644
--- a/files/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -16,6 +16,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyARGB
#include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
@@ -58,9 +59,9 @@ static void ScaleARGBDown2(int src_width,
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
} else {
- src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4;
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
@@ -111,22 +112,31 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEARGBROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
- : ScaleARGBRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 2)) {
+ ? ScaleARGBRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX
+ : ScaleARGBRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
- : ScaleARGBRowDown2Box_MMI);
+ ? ScaleARGBRowDown2_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX
+ : ScaleARGBRowDown2Box_LSX);
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_RVV
+ : ScaleARGBRowDown2Box_RVV);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -141,28 +151,33 @@ static void ScaleARGBDown2(int src_width,
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling down a ARGB to 1/4 of
// its original size.
-static void ScaleARGBDown4Box(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy) {
+static int ScaleARGBDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
int j;
// Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width * 2 * 4 + 31) & ~31;
+  // TODO(fbarchard): Remove this row buffer and implement ScaleARGBRowDown4
+  // as a 2-pass wrapper that uses a very small array on the stack with a
+  // horizontal loop.
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
uint8_t* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
(void)src_width;
(void)src_height;
(void)dx;
@@ -184,16 +199,22 @@ static void ScaleARGBDown4Box(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_RVV;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
- ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size,
dst_width * 2);
- ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ ScaleARGBRowDown2(row, row_size, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
}
free_aligned_buffer_64(row);
+ return 0;
}
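
Aside: the row_size rounding above keeps both scratch rows 32-byte aligned:
for dst_width = 100, 100 * 2 * 4 = 800 is already a multiple of 32, while
dst_width = 99 gives 792, rounded up via (792 + 31) & ~31 = 800 so that the
second row at row + row_size starts aligned as well.
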
// ScaleARGB ARGB Even
@@ -214,7 +235,7 @@ static void ScaleARGBDownEven(int src_width,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
- int row_stride = (dy >> 16) * src_stride;
+ ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
int src_step, uint8_t* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
@@ -222,7 +243,7 @@ static void ScaleARGBDownEven(int src_width,
(void)src_height;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
@@ -253,16 +274,26 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
- : ScaleARGBRowDownEven_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX
+ : ScaleARGBRowDownEven_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
- filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX;
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVENBOX_RVV)
+ if (filtering && TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEvenBox_RVV;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+ if (!filtering && TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -275,24 +306,24 @@ static void ScaleARGBDownEven(int src_width,
}
// Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleARGBBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
@@ -340,6 +371,19 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -361,10 +405,20 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
align_buffer_64(row, clip_src_width * 4);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
@@ -372,7 +426,7 @@ static void ScaleARGBBilinearDown(int src_width,
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
@@ -388,27 +442,28 @@ static void ScaleARGBBilinearDown(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
// Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
@@ -444,14 +499,19 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
@@ -477,6 +537,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -498,11 +566,11 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -513,11 +581,6 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -526,14 +589,16 @@ static void ScaleARGBBilinearUp(int src_width,
{
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
// Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@@ -541,7 +606,9 @@ static void ScaleARGBBilinearUp(int src_width,
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -549,14 +616,16 @@ static void ScaleARGBBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_argb + yi * src_stride;
+ src = src_argb + yi * (intptr_t)src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -570,27 +639,28 @@ static void ScaleARGBBilinearUp(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride_y,
- int src_stride_u,
- int src_stride_v,
- int dst_stride_argb,
- const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleYUVToARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
@@ -611,6 +681,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -627,8 +706,29 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -663,8 +763,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
@@ -692,6 +805,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -713,11 +834,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -728,11 +849,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
const int max_y = (src_height - 1) << 16;
@@ -742,20 +858,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
int yi = y >> 16;
int uv_yi = yi >> kYShift;
- const uint8_t* src_row_y = src_y + yi * src_stride_y;
- const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
- const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
-
- // Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y;
+ const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+ const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
- // Allocate 1 row of ARGB for source conversion.
- align_buffer_64(argb_row, src_width * 4);
+ // Allocate 1 row of ARGB for source conversion and 2 rows of ARGB
+ // scaled horizontally to the destination width.
+ const int row_size = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2 + src_width * 4);
+ uint8_t* argb_row = row + row_size * 2;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
+ if (!row)
+ return 1;
// TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
@@ -782,9 +899,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
y = max_y;
yi = y >> 16;
uv_yi = yi >> kYShift;
- src_row_y = src_y + yi * src_stride_y;
- src_row_u = src_u + uv_yi * src_stride_u;
- src_row_v = src_v + uv_yi * src_stride_v;
+ src_row_y = src_y + yi * (intptr_t)src_stride_y;
+ src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+ src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
}
if (yi != lasty) {
// TODO(fbarchard): Convert the clipped region of row.
@@ -810,7 +927,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
y += dy;
}
free_aligned_buffer_64(row);
- free_aligned_buffer_64(row_argb);
+ return 0;
}
#endif
@@ -832,7 +949,7 @@ static void ScaleARGBSimple(int src_width,
int y,
int dy) {
int j;
- void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
(void)src_height;
@@ -857,11 +974,11 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -872,16 +989,11 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
for (j = 0; j < dst_height; ++j) {
- ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
- dx);
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride,
+ dst_width, x, dx);
dst_argb += dst_stride;
y += dy;
}
@@ -890,19 +1002,19 @@ static void ScaleARGBSimple(int src_width,
 // Scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- int clip_x,
- int clip_y,
- int clip_width,
- int clip_height,
- enum FilterMode filtering) {
+static int ScaleARGB(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -916,7 +1028,7 @@ static void ScaleARGB(const uint8_t* src,
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (intptr_t)src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -931,7 +1043,7 @@ static void ScaleARGB(const uint8_t* src,
if (clip_y) {
int64_t clipf = (int64_t)(clip_y)*dy;
y += (clipf & 0xffff);
- src += (clipf >> 16) * src_stride;
+ src += (clipf >> 16) * (intptr_t)src_stride;
dst += clip_y * dst_stride;
}
@@ -947,51 +1059,50 @@ static void ScaleARGB(const uint8_t* src,
ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
- return;
+ return 0;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
- ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy);
- return;
+ return ScaleARGBDown4Box(src_width, src_height, clip_width,
+ clip_height, src_stride, dst_stride, src,
+ dst, x, dx, y, dy);
}
ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
- return;
+ return 0;
}
     // Optimized odd scale down, i.e. 3, 5, 7, 9x.
if ((dx & 0x10000) && (dy & 0x10000)) {
filtering = kFilterNone;
if (dx == 0x10000 && dy == 0x10000) {
// Straight copy.
- ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
- dst, dst_stride, clip_width, clip_height);
- return;
+ ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4,
+ src_stride, dst, dst_stride, clip_width, clip_height);
+ return 0;
}
}
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
- dst_stride, src, dst, x, y, dy, 4, filtering);
- return;
+ dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering);
+ return 0;
}
if (filtering && dy < 65536) {
- ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy,
- filtering);
- return;
+ return ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
}
if (filtering) {
- ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy,
- filtering);
- return;
+ return ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
}
ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, dx, y, dy);
+ return 0;
}
LIBYUV_API
@@ -1015,10 +1126,9 @@ int ARGBScaleClip(const uint8_t* src_argb,
(clip_y + clip_height) > dst_height) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
- dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
- clip_height, filtering);
- return 0;
+ return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
}
// Scale an ARGB image.
@@ -1036,10 +1146,9 @@ int ARGBScale(const uint8_t* src_argb,
src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
- dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
- filtering);
- return 0;
+ return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, 0, 0, dst_width,
+ dst_height, filtering);
}
// Scale with YUV conversion to ARGB and clipping.
@@ -1063,8 +1172,11 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
- uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
int r;
+ uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
+ if (!argb_buffer) {
+ return 1; // Out of memory runtime error.
+ }
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
diff --git a/files/source/scale_common.cc b/source/scale_common.cc
index 63690271..d07a39af 100644
--- a/files/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -23,6 +23,25 @@ namespace libyuv {
extern "C" {
#endif
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// TODO(fbarchard): make clamp255 preserve negative values.
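+// (v >= 255) is 0 or 1, so -(v >= 255) is 0 or all ones; the OR saturates
+// values >= 255 and the final & 255 truncates to 8 bits.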
+static __inline int32_t clamp255(int32_t v) {
+ return (-(v >= 255) | v) & 255;
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
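+// e.g. a full-scale 10 bit value: C16TO8(1023, 16384) = (1023 * 16384) >> 16
+// = 255.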
+
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
@@ -62,6 +81,50 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ }
+}
+
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst += 1;
+ src_ptr += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale));
+}
+
void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -98,6 +161,52 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale));
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ }
+}
+
+void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale));
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst += 1;
+ s += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale));
+}
+
void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -160,6 +269,61 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst[1] = STATIC_CAST(uint8_t,
+ C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale));
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ }
+}
+
+void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst[1] = STATIC_CAST(uint8_t,
+ C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale));
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst += 1;
+ s += 2;
+ t += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale));
+}
+
void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -400,6 +564,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
+// Sample position: (O is src sample position, X is dst sample position)
+//
+// v dst_ptr at here v stop at here
+// X O X X O X X O X X O X X O X
+// ^ src_ptr at here
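+//
+// Each dst sample lies 1/4 or 3/4 of the way between adjacent src samples,
+// hence the 3:1 weights below; the +2 rounds the sum before the >> 2.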
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// Sample position: (O is src sample position, X is dst sample position)
+//
+// src_ptr at here
+// X v X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
+// ^ dst_ptr at here ^ stop at here
+// X X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
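+//
+// The 2D weights are products of the 1D weights (3/4, 1/4) in each axis:
+// 9:3:3:1, summing to 16; the +8 rounds the sum before the >> 4.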
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
+// Only suitable for at most 14 bit range.
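+// (the 3:1 weighted sum adds 2 bits, so 14 bit input stays within 16 bits)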
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// Only suitable for at most 12 bit range.
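+// (the 9:3:3:1 weighted sum adds 4 bits, so 12 bit input stays within 16 bits)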
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ uint16_t* d = dst_ptr;
+ uint16_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -677,18 +930,18 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -731,15 +984,15 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >>
+ (65536u / 4u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -776,6 +1029,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1018,6 +1273,346 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions
+// Same as ARGB, but with 2 channels
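+// Several of these reinterpret a UV pair as a single uint16_t so that both
+// channels move together (see ScaleUVRowDownEven_C and ScaleUVCols_C below).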
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = src_uv[2]; // Store the 2nd UV
+ dst_uv[1] = src_uv[3];
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += src_stepx * 2;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[4 * x + 0] =
+ (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 1] =
+ (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 2] =
+ (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+ dst_ptr[4 * x + 3] =
+ (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+ }
+}
+
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 1 + 8) >>
+ 4;
+ d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 1 + 8) >>
+ 4;
+ d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 9 + 8) >>
+ 4;
+ e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 9 + 8) >>
+ 4;
+ }
+}
+
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[4 * x + 0] =
+ (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 1] =
+ (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 2] =
+ (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+ dst_ptr[4 * x + 3] =
+ (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+ }
+}
+
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ uint16_t* d = dst_ptr;
+ uint16_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 1 + 8) >>
+ 4;
+ d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 1 + 8) >>
+ 4;
+ d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 9 + 8) >>
+ 4;
+ e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 9 + 8) >>
+ 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
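+// e.g. at the midpoint f = 64: BLENDER1(a, b, 64) = (a * 63 + b * 64) >> 7;
+// the 63 (0x7f ^ 64) rather than 64 (128 - 64) is the off-by-one the TODO
+// above refers to.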
+
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1029,11 +1624,11 @@ void ScalePlaneVertical(int src_height,
int x,
int y,
int dy,
- int bpp,
+ int bpp, // bytes per pixel. 4 for ARGB.
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1075,14 +1670,20 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width_bytes, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
@@ -1097,6 +1698,7 @@ void ScalePlaneVertical(int src_height,
y += dy;
}
}
+
void ScalePlaneVertical_16(int src_height,
int dst_width,
int dst_height,
@@ -1107,11 +1709,11 @@ void ScalePlaneVertical_16(int src_height,
int x,
int y,
int dy,
- int wpp,
+ int wpp, /* words per pixel. normally 1 */
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
- void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+ void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1123,32 +1725,32 @@ void ScalePlaneVertical_16(int src_height,
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
+ if (IS_ALIGNED(dst_width_words, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_NEON;
+ if (IS_ALIGNED(dst_width_words, 8)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
@@ -1168,6 +1770,70 @@ void ScalePlaneVertical_16(int src_height,
}
}
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+void ScalePlaneVertical_16To8(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp, /* words per pixel. normally 1 */
+ int scale,
+ enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher wpp.
+ int dst_width_words = dst_width * wpp;
+ // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
+ void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb,
+ ptrdiff_t src_stride, int scale, int dst_width,
+ int source_y_fraction) = InterpolateRow_16To8_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(wpp >= 1 && wpp <= 2);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * wpp;
+
+#if defined(HAS_INTERPOLATEROW_16TO8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_AVX2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride,
+ scale, dst_width_words, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
@@ -1181,8 +1847,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
@@ -1217,7 +1883,7 @@ int FixedDiv_C(int num, int div) {
return (int)(((int64_t)(num) << 16) / div);
}
-// Divide num by div and return as 16.16 fixed point result.
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
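+// e.g. FixedDiv1_C(9, 5) = ((9 << 16) - 0x10001) / 4 = 131071, one ulp under
+// 2.0 in 16.16 fixed point, i.e. (9 - 1) / (5 - 1).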
int FixedDiv1_C(int num, int div) {
return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
}
@@ -1260,14 +1926,14 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
if (dst_height <= src_height) {
*dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_height > 1) {
+ } else if (src_height > 1 && dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);
*y = 0;
}
@@ -1276,7 +1942,7 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
@@ -1298,35 +1964,6 @@ void ScaleSlope(int src_width,
}
#undef CENTERSTART
-// Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_C(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src2 = src_ptr + src_stride;
-
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- uint16_t p0 = src_ptr[0];
- uint16_t p1 = src_ptr[1];
- uint16_t p2 = src2[0];
- uint16_t p3 = src2[1];
- dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
- dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
- ++src_ptr;
- ++src2;
- dst += 2;
- }
- if (dst_width & 1) {
- uint16_t p0 = src_ptr[0];
- uint16_t p1 = src_ptr[1];
- uint16_t p2 = src2[0];
- uint16_t p3 = src2[1];
- dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
- }
-}
-
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
new file mode 100644
index 00000000..17eeffad
--- /dev/null
+++ b/source/scale_gcc.cc
@@ -0,0 +1,2953 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Rounding constant (2), added before the >> 2 in the 3/4 box scalers
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
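+// Point samples every second pixel: psrlw $8 keeps the odd byte of each
+// 16-bit pair and packuswb repacks the two registers into 16 output bytes.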
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
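+  // xmm4 holds 0x01 in every byte so pmaddubsw sums adjacent pixel pairs;
+  // pavgw against the zeroed xmm5 then gives the rounded (a + b + 1) >> 1.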
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
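+  // Sums a 2x2 box per output pixel: pmaddubsw pairs pixels horizontally,
+  // paddw adds the second row, then psrlw $1 + pavgw 0 give (sum + 2) >> 2.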
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ intptr_t stridex3;
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6");
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
+ 10, 11, 8, 9, 14, 15, 12, 13};
+
+static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
+ 3, 1, 1, 3, 3, 1, 1, 3};
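+
+// For reference, the 2x horizontal upsample kernels below compute, per
+// output pair, (3*near + far + 2) >> 2. kLinearMadd31 supplies the 3,1
+// weights for pmaddubsw and kLinearShuffleFar swaps adjacent words to
+// fetch the "far" sample. A minimal scalar sketch of the same math
+// (illustrative only; this name is not part of libyuv):
+static void ScaleRowUp2_Linear_Sketch(const uint8_t* src_ptr,
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  int src_width = dst_width >> 1;
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[2 * x + 0] = (3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2;
+    dst_ptr[2 * x + 1] = (src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2;
+  }
+}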
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $1,%%xmm6 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm6,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
+ "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ LABELALIGN
+ "1: \n"
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ // above line
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n" // near+far
+ "movdqa %%xmm3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n" // 2*near
+ "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movq (%0,%3),%%xmm6 \n" // 01234567
+ "movq 1(%0,%3),%%xmm2 \n" // 12345678
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm6,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
+ "paddw %%xmm7,%%xmm5 \n" // near+far
+ "movdqa %%xmm3,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
+ "paddw %%xmm7,%%xmm7 \n" // 2*near
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
+
+ "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm6,%%xmm2 \n" // near+far
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
+
+ // xmm4 xmm1
+ // xmm5 xmm2
+ "pcmpeqw %%xmm0,%%xmm0 \n"
+ "psrlw $15,%%xmm0 \n"
+ "psllw $3,%%xmm0 \n" // all 8
+
+ "movdqa %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+
+ "movdqa %%xmm1,%%xmm7 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm7 \n" // ^ div by 16
+
+ "packuswb %%xmm7,%%xmm3 \n"
+ "movdqu %%xmm3,(%1) \n" // save above line
+
+ "movdqa %%xmm5,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
+ "psrlw $4,%%xmm2 \n" // ^ div by 16
+
+ "packuswb %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // save below line
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
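+
+// The bilinear (2D) kernels in this file factor the 2x upsample into two
+// 3:1 blends: one horizontal pass per source row (t1 and t2), then a
+// vertical 3:1 blend of the two rows, so each output is
+//   (3*t1 + t2 + 8) >> 4 == (9*near + 3*far_h + 3*far_v + far_diag + 8) >> 4
+// which is the "9 3 3 1 + 8 ... div by 16" noted in the comments.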
+
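+// The *_12_* variants below assume at most 12 significant bits per sample,
+// so the 9/3/3/1 sums plus rounding still fit in unsigned 16-bit words and
+// the blend can stay in 16-bit lanes.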
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %3,%%xmm5 \n"
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)
+
+ "paddw %%xmm4,%%xmm1 \n" // far+2
+ "paddw %%xmm4,%%xmm3 \n" // far+2
+ "paddw %%xmm0,%%xmm1 \n" // near+far+2
+ "paddw %%xmm2,%%xmm3 \n" // near+far+2
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)
+
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,16(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n"
+ "psllw $3,%%xmm7 \n" // all 8
+ "movdqa %5,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ // above line
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
+ "paddw %%xmm0,%%xmm1 \n" // near+far
+ "paddw %%xmm2,%%xmm3 \n" // near+far
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
+ "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
+ "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
+ "movdqa %%xmm3,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
+ "paddw %%xmm1,%%xmm4 \n" // near+far
+ "paddw %%xmm3,%%xmm5 \n" // near+far
+ "paddw %%xmm1,%%xmm1 \n" // 2*near
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
+ "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,(%1) \n"
+
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,0x10(%1) \n"
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16
+ "movdqu %%xmm1,(%1,%4,2) \n"
+
+ "movdqa %%xmm3,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+ "movdqu %%xmm3,0x10(%1,%4,2) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
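+// The *_16_* variants below handle full 16-bit samples, so they widen to
+// 32-bit lanes (punpcklwd with zero) before blending; the 9/3/3/1 sums
+// would overflow 16-bit intermediates.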
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqd %%xmm4,%%xmm4 \n"
+ "psrld $31,%%xmm4 \n"
+ "pslld $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pshufd $0b11011000,%%xmm0,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n"
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0,%3,2),%%xmm2 \n"
+ "movq 2(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packssdw %%xmm0,%%xmm4 \n"
+ "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packssdw %%xmm2,%%xmm5 \n"
+ "pshufd $0b11011000,%%xmm5,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 1(%0,%3),%%xmm4 \n"
+ "punpcklwd %%xmm1,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // ymm0 ymm1
+ // ymm2 ymm3
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
+ "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
+
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
+ "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
+
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,32(%1) \n"
+
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
+
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile("pxor %%xmm5,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
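+
+// Scalar model of the accumulation above (a sketch, not a libyuv API):
+// widen each source byte and add it into the 16-bit sums with unsigned
+// saturation, matching paddusw.
+static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int src_width) {
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
+    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturates
+  }
+}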
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1, temp_pixel;
+ asm volatile(
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ // 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
+#if defined(__x86_64__)
+ "+rm"(dst_width) // %5
+#else
+ "+m"(dst_width) // %5
+#endif
+ : "rm"(x), // %6
+ "rm"(dx), // %7
+#if defined(__x86_64__)
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
+#else
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// No alignment requirement: movdqu is used throughout.
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
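+
+// The shufps $0x88/$0xdd plus pavgb pattern above splits even and odd ARGB
+// pixels after the rows were averaged vertically, then averages the pair,
+// i.e. per channel:
+//   dst = (avg(r0[2x], r1[2x]) + avg(r0[2x+1], r1[2x+1]) + 1) >> 1
+// Note pavgb rounds at each step, so this is not an exact 4-tap mean.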
+
+// Reads 4 pixels at a time, sampling every src_stepx'th pixel.
+// No alignment requirement: movdqu is used for the store.
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ (void)src_stride;
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Blends four 2x2 pixel blocks to 4x1.
+// No alignment requirement: movdqu is used for the store.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ LABELALIGN
+ "40: \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// No alignment requirement: movdqu is used throughout.
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
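+
+// The ARGB bilinear column filter below keeps x in 16.16 fixed point and
+// uses the top 7 bits of the fraction, f = (x >> 9) & 0x7f, to weight two
+// adjacent pixels per channel, approximately
+//   dst = (src[xi] * (128 - f) + src[xi + 1] * f) >> 7
+// kShuffleColARGB interleaves the two pixels for pmaddubsw and
+// kShuffleFractions broadcasts f across each 8-byte half.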
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile(
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
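+
+// Portable equivalents of the two divides above (a sketch using 64-bit
+// math in place of the 32-bit idiv):
+//   FixedDiv_X86(num, div)  ~ (int)(((int64_t)num << 16) / div)
+//   FixedDiv1_X86(num, div) ~ (int)((((int64_t)num << 16) - 0x00010001) /
+//                                   (div - 1))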
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
+ defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
+#endif
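+
+// The UV 2x2 box filters below deinterleave U and V with kShuffleSplitUV,
+// add horizontal pairs via pmaddubsw against all-ones weights, add the
+// second row, then round in two steps (psrlw $1 followed by pavgw with
+// zero). Per channel that is exactly the rounded average
+//   dst = (r0[2x] + r0[2x+1] + r1[2x] + r1[2x+1] + 2) >> 2
+// before kShuffleMergeUV re-interleaves U and V.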
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+
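+// Same 3:1 blend weights as kLinearMadd31, laid out for interleaved UV so
+// that the U and V samples of each pixel share a weight.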
+static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
+ 3, 1, 3, 1, 1, 3, 1, 3};
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 2(%0,%3),%%xmm4 \n"
+ "punpcklbw %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm1,%%xmm3 \n"
+ "punpckldq %%xmm1,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // row 1: ymm0 (lo) ymm1 (hi)
+ // row 2: ymm2 (lo) ymm3 (hi)
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
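+// 16-bit UV variants: pmaddubsw only operates on bytes, so these widen to
+// 32-bit lanes and form 3*near+far from explicit adds (near + far, then
+// + 2*near) before narrowing back with packusdw (hence the SSE4.1 floor).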
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqd %%xmm4,%%xmm4 \n"
+ "psrld $31,%%xmm4 \n"
+ "pslld $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packusdw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n"
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0,%3,2),%%xmm2 \n"
+ "movq 4(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo)
+ "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packusdw %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packusdw %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_lsx.cc b/source/scale_lsx.cc
new file mode 100644
index 00000000..bfe5e9fb
--- /dev/null
+++ b/source/scale_lsx.cc
@@ -0,0 +1,739 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define LOAD_DATA(_src, _in, _out) \
+ { \
+ int _tmp1, _tmp2, _tmp3, _tmp4; \
+ DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \
+ _tmp2, _tmp3, _tmp4); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \
+ }
+
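+// LOAD_DATA is a 4-element gather: _in carries four 32-bit indices that are
+// extracted to general registers and used to insert _src[index] into each
+// lane of _out. LSX has no gather instruction, so the column scalers below
+// use this to fetch non-contiguous source pixels.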
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ (void)src_stride;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ dst0 = __lsx_vpickod_w(src1, src0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ (void)src_stride;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_w(src1, src0);
+ tmp1 = __lsx_vpickod_w(src1, src0);
+ dst0 = __lsx_vavgr_bu(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
+ shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+ __lsx_vst(dst0, dst_argb, 0);
+ s += 32;
+ t += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ int32_t stepx = src_stepx << 2;
+ (void)src_stride;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ dst0 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst1 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst2 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst3 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ __lsx_vstelm_w(dst0, dst_argb, 0, 0);
+ __lsx_vstelm_w(dst1, dst_argb, 4, 0);
+ __lsx_vstelm_w(dst2, dst_argb, 8, 0);
+ __lsx_vstelm_w(dst3, dst_argb, 12, 0);
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ int32_t stepx = src_stepx * 4;
+ const uint8_t* next_argb = src_argb + src_stride;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, dst0;
+
+ for (x = 0; x < len; x++) {
+ tmp0 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp1 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp2 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp3 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp4 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp5 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp6 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp7 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+ dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
+ __lsx_vst(dst0, dst_argb, 0);
+ dst_argb += 16;
+ }
+}
+
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ __m128i src0, src1, src2, src3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ const uint8_t* src_nex = src_ptr + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ src_nex += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 16;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst, 0);
+ src_ptr += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 16;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ const uint8_t* ptr3 = ptr2 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, reg0, reg1, reg2, reg3);
+ DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst, 0);
+ src_ptr += 64;
+ ptr1 += 64;
+ ptr2 += 64;
+ ptr3 += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x, len;
+ __m128i src0, src1, tmp0;
+ __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};
+
+ assert(dst_width % 3 == 0);
+ len = dst_width / 12;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ tmp0 = __lsx_vshuf_b(src1, src0, shuff);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst, 8, 2);
+ src_ptr += 32;
+ dst += 12;
+ }
+}
+
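+// Divisions by 6 and 9 in the 3/8 box scalers below are done with Q16
+// reciprocal multiplies: 0x2AAA (~65536/6) and 0x1C71 (~65536/9). The high
+// 16 bits of the product, taken either via vmuh_h or a full multiply plus
+// byte shuffle, approximate x/6 and x/9 without a divide.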
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, len;
+ const uint8_t* src_nex = src_ptr + src_stride;
+ __m128i src0, src1, src2, src3, dst0;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
+ __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+ tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ src_nex += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, len;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i zero = __lsx_vldi(0);
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+ __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1,
+ src2, src3);
+ DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+ tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ ptr1 += 32;
+ ptr2 += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ int x;
+ int len = src_width / 16;
+ __m128i src0, tmp0, tmp1, dst0, dst1;
+ __m128i zero = __lsx_vldi(0);
+
+ assert(src_width > 0);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_ptr, 0);
+ DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
+ tmp0 = __lsx_vilvl_b(zero, src0);
+ tmp1 = __lsx_vilvh_b(zero, src0);
+ DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+}
+
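+// The column scalers walk the source in 16.16 fixed point: x >> 16 is the
+// integer pixel index and bits 15..9 of the fraction give a 7-bit blend
+// weight f, so each output is roughly
+//   dst[i] = src[xi] + (((src[xi + 1] - src[xi]) * f + 0x40) >> 7);
+// const_tmp pre-multiplies dx by {0,1,2,3} so four lanes step in lockstep.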
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ int len = dst_width / 16;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
+ __m128i const2 = __lsx_vreplgr2vr_w(0x40);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+ vec1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, vec0);
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);
+ tmp4 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);
+ tmp5 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp2 = __lsx_vsrai_w(vec_x, 16);
+ tmp6 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp3 = __lsx_vsrai_w(vec_x, 16);
+ tmp7 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
+ tmp6, tmp7);
+ LOAD_DATA(src_ptr, tmp0, reg0);
+ LOAD_DATA(src_ptr, tmp1, reg1);
+ LOAD_DATA(src_ptr, tmp2, reg2);
+ LOAD_DATA(src_ptr, tmp3, reg3);
+ DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
+ tmp2, tmp3);
+ LOAD_DATA(src_ptr, tmp0, reg4);
+ LOAD_DATA(src_ptr, tmp1, reg5);
+ LOAD_DATA(src_ptr, tmp2, reg6);
+ LOAD_DATA(src_ptr, tmp3, reg7);
+ DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
+ const2, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
+ reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_ptr, 0);
+ dst_ptr += 16;
+ }
+}
+
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)src_argb;
+ uint32_t* dst = (uint32_t*)dst_argb;
+ int j;
+ int len = dst_width / 4;
+ __m128i tmp0, tmp1, tmp2, dst0;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+ tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
+ tmp1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, tmp0);
+
+ for (j = 0; j < len; j++) {
+ tmp2 = __lsx_vsrai_w(vec_x, 16);
+ vec_x = __lsx_vadd_w(vec_x, tmp1);
+ LOAD_DATA(src, tmp2, dst0);
+ __lsx_vst(dst0, dst, 0);
+ dst += 4;
+ }
+}
+
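+// ScaleARGBFilterCols_LSX blends whole ARGB pixels: the 7-bit fraction f is
+// splatted across a pixel with vshuf4i_b and interleaved with its
+// complement f ^ 0x7F (= 127 - f), so one vdp2_h_bu per pixel pair computes
+// roughly (near * (127 - f) + far * f) per byte channel, which vsrani_b_h
+// then narrows back to bytes with a >> 7.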
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)src_argb;
+ int j;
+ int len = dst_width / 8;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0, dst1;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+ __m128i const_7f = __lsx_vldi(0x7F);
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+ vec1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, vec0);
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);
+ reg0 = __lsx_vsrai_w(vec_x, 9);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);
+ reg1 = __lsx_vsrai_w(vec_x, 9);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
+ DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
+ DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
+ DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
+ LOAD_DATA(src, tmp0, src0);
+ LOAD_DATA(src, tmp1, src1);
+ DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
+ LOAD_DATA(src, tmp0, src2);
+ LOAD_DATA(src, tmp1, src3);
+ DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2;
+ __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
+ __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
+ __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0,
+ dst1);
+ dst2 = __lsx_vshuf_b(src3, src2, shuff2);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ src_ptr += 64;
+ dst += 48;
+ }
+}
+
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
+ tmp7, tmp8);
+ DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
+ DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/scale_msa.cc b/source/scale_msa.cc
index 482a521f..482a521f 100644
--- a/files/source/scale_msa.cc
+++ b/source/scale_msa.cc
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
new file mode 100644
index 00000000..ccc75106
--- /dev/null
+++ b/source/scale_neon.cc
@@ -0,0 +1,1533 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away the even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x1, average down, and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x2, average down, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ // row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc");
+}
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to de-interleave the source so every 4th pixel lands in its own register.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1 + 2) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+ "cc");
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
+
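+// Note: vqrdmulh.s16 used below is a doubling multiply-high
+// ((2*a*b + 0x8000) >> 16), so the Div6/Div9 constants are 65536/12 and
+// 65536/18; the doubling restores the intended divides by 6 and 9.
+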
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile(
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around so that adjacent
+ // data can be added: 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+ "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around so that adjacent
+ // data can be added: 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
+
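+// 2x up-samplers, mirroring the x86 versions: the 1D pass uses the (3,1)/4
+// linear filter and the 2D pass the (9,3,3,1)/16 bilinear filter. vmlal.u8
+// against a vector of 3s forms 3*near+far directly in 16-bit lanes, and
+// vrshrn/vrshr supply the rounded shifts.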
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u8 d30, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%3]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.8 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%5]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n"
+ "vld1.8 {d9}, [%6]! \n"
+
+ "vmovl.u8 q2, d8 \n"
+ "vmovl.u8 q3, d9 \n"
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.8 {d0, d1}, [%2]! \n" // store
+ "vst2.8 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
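+// The _12_ variants keep the whole 9/3/3/1 computation in 16-bit lanes: with
+// at most 12 significant bits per sample, 3*near+far <= 4*4095, and the
+// second 3:1 pass stays <= 16*4095 = 65520, which still fits in uint16_t.
+// Full 16-bit input uses the _16_ variants below, which widen to 32 bits.
+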
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
+ "vld1.16 {q3}, [%6]! \n" // 12345678 (16b)
+
+ "vmovq q4, q2 \n"
+ "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q3, q4, q15 \n" // 3*near+far (even)
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ "vrshr.u16 q2, q1, #4 \n" // 2, even
+ "vrshr.u16 q3, q0, #4 \n" // 2, odd
+ "vrshr.u16 q0, q5, #4 \n" // 1, even
+ "vrshr.u16 q1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
+ "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u16 d31, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%3]! \n" // 12345678 (16b)
+
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 4567 (32b)
+ "vmovl.u16 q4, d2 \n" // 1234 (32b)
+ "vmovl.u16 q5, d3 \n" // 5678 (32b)
+
+ "vmlal.u16 q2, d2, d31 \n"
+ "vmlal.u16 q3, d3, d31 \n"
+ "vmlal.u16 q4, d0, d31 \n"
+ "vmlal.u16 q5, d1, d31 \n"
+
+ "vrshrn.u32 d0, q4, #2 \n"
+ "vrshrn.u32 d1, q5, #2 \n"
+ "vrshrn.u32 d2, q2, #2 \n"
+ "vrshrn.u32 d3, q3, #2 \n"
+
+ "vst2.16 {q0, q1}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 d31, #3 \n"
+ "vmov.u32 q14, #3 \n"
+
+ "1: \n"
+ "vld1.16 {d0}, [%0]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%5]! \n" // 1234 (16b)
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 1234 (32b)
+ "vmlal.u16 q2, d1, d31 \n"
+ "vmlal.u16 q3, d0, d31 \n"
+
+ "vld1.16 {d0}, [%1]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%6]! \n" // 1234 (16b)
+ "vmovl.u16 q4, d0 \n" // 0123 (32b)
+ "vmovl.u16 q5, d1 \n" // 1234 (32b)
+ "vmlal.u16 q4, d1, d31 \n"
+ "vmlal.u16 q5, d0, d31 \n"
+
+ "vmovq q0, q4 \n"
+ "vmovq q1, q5 \n"
+ "vmla.u32 q4, q2, q14 \n"
+ "vmla.u32 q5, q3, q14 \n"
+ "vmla.u32 q2, q0, q14 \n"
+ "vmla.u32 q3, q1, q14 \n"
+
+ "vrshrn.u32 d1, q4, #4 \n"
+ "vrshrn.u32 d0, q5, #4 \n"
+ "vrshrn.u32 d3, q2, #4 \n"
+ "vrshrn.u32 d2, q3, #4 \n"
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store
+ "vst2.16 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #8 \n" // 4 sample -> 8 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "vmov.u8 d30, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
+ "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store
+ "vst2.16 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "vmov.u16 d30, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
+ "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
+
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b)
+ "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b)
+ "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b)
+ "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even)
+ "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even)
+ "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.32 {d0, d1}, [%1]! \n" // store
+ "vst2.32 {d2, d3}, [%1]! \n" // store
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 2;
+ const uint16_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "vmov.u16 d30, #3 \n"
+ "vmov.u32 q14, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v)
+ "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v)
+ "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v)
+ "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd)
+ "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even)
+
+ "vmovq q0, q4 \n"
+ "vmovq q1, q5 \n"
+ "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
+ "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
+ "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
+ "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
+
+ "vrshrn.u32 d1, q4, #4 \n" // 1, odd
+ "vrshrn.u32 d0, q5, #4 \n" // 1, even
+ "vrshrn.u32 d3, q2, #4 \n" // 2, odd
+ "vrshrn.u32 d2, q3, #4 \n" // 2, even
+
+ "vst2.32 {d0, d1}, [%2]! \n" // store
+ "vst2.32 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #4 \n" // 2 uv -> 4 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d30" // Clobber List
+ );
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
+ );
+}
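+
+// A minimal scalar sketch of the accumulation above:
+// for (int i = 0; i < src_width; ++i) {
+//   dst_ptr[i] += src_ptr[i];
+// }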
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_ptr;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
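+
+// A minimal scalar sketch of the 16.16 fixed-point stepping the lanes above
+// vectorize, in terms of the BLENDER formula quoted before the function:
+// for (int i = 0; i < dst_width; ++i) {
+//   int xi = x >> 16;
+//   dst_ptr[i] = BLENDER(src_ptr[xi], src_ptr[xi + 1], x & 0xffff);
+//   x += dx;
+// }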
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
+}
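+
+// A minimal scalar sketch of the general path above, with f =
+// source_y_fraction in (0, 256):
+// dst_ptr[i] =
+//     (src_ptr[i] * (256 - f) + src_ptr[i + src_stride] * f + 128) >> 8;
+// The 64/128/192 fractions are approximated with rounding half adds; two
+// vrhadds give roughly (a + 3 * b + 3) / 4 for the 25/75 blend.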
+
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // copy odd pixels for vst2
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
+// 4a: 3e04 subs r6, #4
+// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
+// 50: ef64 21f4 vorr q9, q10, q10
+// 54: f942 038d vst2.32 {d16-d19}, [r2]!
+// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
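+
+// A minimal scalar sketch, per channel (B shown; G, R and A are identical):
+// dst[i].b = (row0[2 * i].b + row0[2 * i + 1].b +
+//             row1[2 * i].b + row1[2 * i + 1].b + 2) >> 2;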
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
+}
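+
+// A minimal scalar sketch: each output is the 2x2 box average of a block that
+// starts src_stepx pixels after the previous one (p is a hypothetical name):
+// const uint8_t* p = src_argb + i * src_stepx * 4;  // 4 bytes per ARGB pixel
+// dst[i].b = (p[0] + p[4] + p[src_stride] + p[src_stride + 4] + 2) >> 2;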
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld1.32 {" #dn "[" #n "]}, [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int tmp;
+ const uint8_t* src_tmp = src_argb;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+ // clang-format on
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_argb;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n"
+ "vand.16 d22, d22, d30 \n"
+ "vdup.8 d24, d22[0] \n"
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n"
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n"
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n"
+ "vshrn.i16 d1, q12, #7 \n"
+
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
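+
+// A minimal scalar sketch of the 7-bit blend above: f = (x >> 9) & 0x7f keeps
+// the top 7 bits of the 16.16 fraction, and per channel
+// dst = (a * (0x7f ^ f) + b * f) >> 7;  // roughly a + (b - a) * f / 128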
+
+#undef LOAD2_DATA32_LANE
+
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.16 {q1}, [%1]! \n" // store 8 UV
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.16 {q0}, [%1]! \n" // store 8 UV
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
new file mode 100644
index 00000000..7c072380
--- /dev/null
+++ b/source/scale_neon64.cc
@@ -0,0 +1,1578 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
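+
+// A minimal scalar sketch: each output byte is the rounded mean of a 4x4
+// block, dst_ptr[i] = (sum of the 16 source bytes + 8) >> 4.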
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // (3 * line_0 + line_1 + 2) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "memory", "cc");
+}
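+
+// A minimal scalar sketch: t is the 3:1 vertical filter of the two rows, and
+// every 4 filtered pixels reduce to 3 with the a0/a1/a2 taps above:
+// t[j] = (3 * row0[j] + row1[j] + 2) >> 2;
+// dst[0] = (3 * t[0] + t[1] + 2) >> 2;
+// dst[1] = (t[1] + t[2] + 1) >> 1;
+// dst[2] = (t[2] + 3 * t[3] + 2) >> 2;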
+
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
+}
+
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
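+
+// Note the halved reciprocals: sqrdmulh doubles the product before taking the
+// high 16 bits, so 65536 / 12 divides by 6 and 65536 / 18 divides by 9:
+// (2 * x * (65536 / 12)) >> 16 ~= x / 6.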
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
+
+ asm volatile(
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
+
+ // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the divisor
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // Align for table lookup, tbl requires registers to be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+ "memory", "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
+ asm volatile(
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
+
+ // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the divisor
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, tbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v30", "v31", "memory", "cc");
+}
+
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%1], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%2], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1
+ "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v2.16b, v0.16b \n"
+ "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
+
+ "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v2.16b \n"
+ "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
+ "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v4.16b \n"
+ "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
+
+ "urshr v2.8h, v2.8h, #4 \n" // 2, odd
+ "urshr v1.8h, v3.8h, #4 \n" // 2, even
+ "urshr v4.8h, v4.8h, #4 \n" // 1, odd
+ "urshr v3.8h, v5.8h, #4 \n" // 1, even
+
+ "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
+ "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
+
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b)
+ "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd)
+ "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even)
+
+ "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd)
+
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.4h, #3 \n"
+ "movi v30.4s, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 0123 (16b)
+ "ldr d1, [%2], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // 0123 (16b)
+ "ldr d1, [%3], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far
+
+ "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1
+ "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2
+
+ "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 00112233 (1u1v)
+ "ldr d1, [%1], #8 \n" // 11223344 (1u1v)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n"
+ "ldr d1, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n"
+ "ushll v3.8h, v1.8b, #0 \n"
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n"
+ "ushll v5.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
+ "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
+ "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store
+ "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 2;
+ const uint16_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "movi v31.4h, #3 \n"
+ "movi v30.4s, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n"
+ "ldr d1, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v2.4s, #4 \n" // 2, odd
+ "rshrn v0.4h, v3.4s, #4 \n" // 2, even
+ "rshrn v3.4h, v4.4s, #4 \n" // 1, odd
+ "rshrn v2.4h, v5.4s, #4 \n" // 1, even
+
+ "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
+ "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_ptr;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
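+
+// For reference, a scalar sketch of the loop ScaleFilterCols_NEON vectorizes
+// (hypothetical helper; the production scalar version is ScaleFilterCols_C):
+static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width; ++j) {
+ int xi = x >> 16; // integer source position
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ // BLENDER: blend a toward b by the 16-bit fraction of x, with rounding.
+ dst_ptr[j] = (uint8_t)(a + ((((x & 0xffff) * (b - a)) + 0x8000) >> 16));
+ x += dx;
+ }
+}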
+
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+}
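+
+// Per channel, the box filter above computes for each output pixel:
+// dst = (p00 + p01 + p10 + p11 + 2) >> 2
+// where p00/p01 are horizontally adjacent pixels in row 0 (summed by
+// uaddlp), p10/p11 the pixels below them (accumulated by uadalp), and the
+// rounding add of 2 comes from rshrn #2.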
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb must be 4-byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64_t)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb must be 4-byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in the future.
+// It could be upgraded to process 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64_t)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping.
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ int64_t tmp64;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ // clang-format on
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+#undef LOAD1_DATA32_LANE
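+
+// The macro above implements one lane of this scalar gather (dst/src viewed
+// as uint32_t ARGB pixels):
+// for (int j = 0; j < dst_width; ++j) {
+// dst[j] = src[x >> 16]; // x is a 16.16 fixed-point position
+// x += dx;
+// }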
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping.
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n"
+ "and v2.8b, v2.8b, v4.8b \n"
+ "dup v16.8b, v2.b[0] \n"
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n"
+ "ext v17.8b, v18.8b, v19.8b, #4 \n"
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n"
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n"
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "add v16.8h, v16.8h, v18.8h \n"
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n"
+ "shrn2 v0.16b, v17.8h, #7 \n"
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+// Read 16x2 pixels, average down, and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "1: \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Read 8x2 pixels, upsample with filtering, and write 16x1.
+// Actually reads an extra pixel, so effectively 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
+
+ "1: \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ : "r"(2LL), // %4
+ "r"(14LL) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19" // Clobber List
+ );
+}
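+
+// The 9/3/3/1 weights above are 2x bilinear upsampling: each output pixel
+// lies 1/4 of the way from its nearest source pixel in each axis, so per
+// output (a worked form of the arithmetic above):
+// dst = (9 * nearest + 3 * horizontal + 3 * vertical + diagonal + 8) >> 4
+// with the +8 providing rounding for the >> 4 (uqrshrn).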
+
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.8h}, [%1], #16 \n" // store 8 UV
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 UV
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc
new file mode 100644
index 00000000..8db59b56
--- /dev/null
+++ b/source/scale_rgb.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h" /* For FilterMode */
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_rgb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Scale a 24-bit RGB image.
+// Converts to ARGB as an intermediate step.
+
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int r;
+ // Allocate one block holding both intermediate ARGB images.
+ uint8_t* src_argb =
+ (uint8_t*)malloc(src_width * src_height * 4 + dst_width * dst_height * 4);
+ uint8_t* dst_argb;
+
+ if (!src_argb) {
+ return 1;
+ }
+ dst_argb = src_argb + src_width * src_height * 4;
+
+ r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width,
+ src_height);
+ if (!r) {
+ r = ARGBScale(src_argb, src_width * 4, src_width, src_height, dst_argb,
+ dst_width * 4, dst_width, dst_height, filtering);
+ if (!r) {
+ r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb,
+ dst_width, dst_height);
+ }
+ }
+ free(src_argb);
+ return r;
+}
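+
+// A minimal usage sketch of RGBScale; the dimensions below are illustrative
+// assumptions, not requirements of the API:
+// uint8_t src[360 * 640 * 3]; // RGB24 input, stride = 640 * 3 bytes
+// uint8_t dst[180 * 320 * 3]; // RGB24 output, stride = 320 * 3 bytes
+// int r = RGBScale(src, 640 * 3, 640, 360,
+// dst, 320 * 3, 320, 180, kFilterBilinear);
+// // r is 0 on success, non-zero on failure (e.g. allocation).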
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
new file mode 100644
index 00000000..de037e45
--- /dev/null
+++ b/source/scale_rvv.cc
@@ -0,0 +1,1040 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * Contributed by Darren Hsieh <darren.hsieh@sifive.com>
+ * Contributed by Bruce Lai <bruce.lai@sifive.com>
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+// This module is for clang RVV. GCC does not yet support segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+ defined(__clang__)
+#include <assert.h>
+#include <riscv_vector.h>
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAS_SCALEADDROW_RVV
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ size_t w = (size_t)src_width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl);
+ // Use widening multiply-add instead of widening + add
+ v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl);
+ __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl);
+ w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
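+
+// All RVV loops in this file use the same strip-mining pattern: vsetvl
+// returns how many elements the hardware will process this iteration (at
+// most the requested w), the body handles exactly vl elements, and the
+// pointers and remaining width advance by vl until the row is consumed.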
+
+#ifdef HAS_SCALEARGBROWDOWN2_RVV
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ size_t w = (size_t)dst_width;
+ const uint64_t* src = (const uint64_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+ do {
+ size_t vl = __riscv_vsetvl_e64m8(w);
+ vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl);
+ vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl);
+ __riscv_vse32_v_u32m4(dst, v_dst, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_odd, v_even, v_dst;
+ vuint32m4_t v_odd_32, v_even_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
+ v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
+ v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
+ // Use round-to-nearest-up mode for averaging add
+ v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src += vl * 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src0 = (const uint32_t*)(src_argb);
+ const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
+ vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
+ vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl);
+ __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl);
+ v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
+ v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
+ v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
+ v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
+ v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
+ v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
+ v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src0 += vl * 2;
+ src1 += vl * 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+ const int stride_byte = src_stepx * 4;
+ do {
+ size_t vl = __riscv_vsetvl_e32m8(w);
+ vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl);
+ __riscv_vse32_v_u32m8(dst, v_row, vl);
+ w -= vl;
+ src += vl * src_stepx;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src0 = (const uint32_t*)(src_argb);
+ const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+ const int stride_byte = src_stepx * 4;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
+ vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
+ vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0,
+ stride_byte, vl);
+ __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1,
+ stride_byte, vl);
+ v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
+ v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
+ v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
+ v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
+ v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
+ v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
+ v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src0 += vl * src_stepx;
+ src1 += vl * src_stepx;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2_RVV
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint16_t* src = (const uint16_t*)src_ptr;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e16m8(w);
+ vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl);
+ vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2LINEAR_RVV
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ (void)src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_s0, v_s1, v_dst;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl);
+ // Use round-to-nearest-up mode for averaging add
+ v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src_ptr += 2 * vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2BOX_RVV
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m8_t v_s01, v_t01, v_st01;
+ vuint8m4_t v_dst;
+ __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl);
+ __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl);
+ v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
+ v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
+ v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ s += 2 * vl;
+ t += 2 * vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN4_RVV
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
+ w -= vl;
+ src_ptr += (4 * vl);
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
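+
+// vlseg4e8 deinterleaves each group of 4 source bytes into 4 vectors, so
+// storing v_s2 keeps pixel 2 of every group; scalar equivalent:
+// for (int i = 0; i < dst_width; ++i) dst_ptr[i] = src_ptr[4 * i + 2];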
+
+#ifdef HAS_SCALEROWDOWN4BOX_RVV
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+ vuint8m2_t v_v0, v_v1, v_v2, v_v3;
+ vuint16m4_t v_s01, v_s23, v_t01, v_t23;
+ vuint16m4_t v_u01, v_u23, v_v01, v_v23;
+ vuint16m4_t v_st01, v_st23, v_uv01, v_uv23;
+ vuint16m4_t v_st0123, v_uv0123, v_stuv0123;
+ vuint8m2_t v_dst;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl);
+ v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl);
+ v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
+ v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);
+
+ v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
+ v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
+ v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
+ v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl);
+
+ v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
+ v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);
+
+ v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
+ v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);
+
+ v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
+ v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
+ v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl);
+ __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
+ w -= vl;
+ src_ptr += 4 * vl;
+ src_ptr1 += 4 * vl;
+ src_ptr2 += 4 * vl;
+ src_ptr3 += 4 * vl;
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_RVV
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
+ w -= vl;
+ src_ptr += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
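+
+// The unfiltered 3/4 kernel keeps pixels 0, 1 and 3 of every 4; scalar
+// equivalent:
+// dst[3 * i + 0] = src[4 * i + 0];
+// dst[3 * i + 1] = src[4 * i + 1];
+// dst[3 * i + 2] = src[4 * i + 3];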
+
+#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
+ vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+ vuint16m4_t v_u1_u16;
+ vuint8m2_t v_a0, v_a1, v_a2;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+ if (src_stride == 0) {
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
+ v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
+ } else {
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
+ v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
+ v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
+ t += 4 * vl;
+ }
+
+ v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
+ v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
+ v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
+ v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);
+
+ // Use round-to-nearest-up mode for vnclip & averaging add
+ v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
+ v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
+ v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
+ v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
+ v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
+ v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+ w -= vl;
+ s += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
+ vuint16m4_t v_u1_u16;
+ vuint8m2_t v_a0, v_a1, v_a2;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+ // Use round-to-nearest-up mode for vnclip & averaging add
+ if (src_stride == 0) {
+ v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
+ v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
+ v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
+ v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
+ } else {
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+ v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
+ v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
+ v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
+ v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
+ t += 4 * vl;
+ }
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
+ v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
+ v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+ w -= vl;
+ s += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_RVV
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ (void)src_stride;
+ assert(dst_width % 3 == 0);
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 6u);
+ const uint16_t coeff_b = (65536u / 4u);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f;
+ vuint16m2_t v_g0, v_g1, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ // Calculate sum of [e00, e21] to v_e
+ // Calculate sum of [f00, f21] to v_f
+ // Calculate sum of [g00, g11] to v_g
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+ // Average in 16-bit fixed-point
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
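+
+// coeff_a and coeff_b above implement division by the box area with a
+// multiply-high: vmulhu returns (sum * coeff) >> 16, and with
+// coeff = 65536 / 6 that approximates sum / 6 (truncating) for the sums of
+// at most 6 * 255 produced here. Worked example for one 'e' output:
+// uint16_t sum = e00 + e01 + e10 + e11 + e20 + e21; // 3x2 box, <= 1530
+// uint8_t dst_e = (uint8_t)(((uint32_t)sum * (65536u / 6u)) >> 16);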
+
+#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 9u);
+ const uint16_t coeff_b = (65536u / 6u);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+ vuint16m2_t v_g0, v_g1, v_g2, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ // u: e02, e12, e22, f02, f12, f22, g02, g12
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
+ &v_u7, src_ptr + 2 * src_stride, vl);
+ // Calculate sum of [e00, e22]
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
+ v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ // Calculate sum of [f00, f22]
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+ v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ // Calculate sum of [g00, g12]
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+ v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+ // Average in 16-bit fixed-point
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+// The ScaleRowUp2_(Bi)linear_RVV functions are equivalent to other platforms'
+// ScaleRowUp2_(Bi)linear_Any_XXX. They process the entire row; other
+// platforms implement only the non-edge part of the image and handle the
+// edges with scalar code.
+
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t work_width = (size_t)dst_width - 1u;
+ size_t src_width = work_width >> 1u;
+ const uint8_t* work_src_ptr = src_ptr;
+ uint8_t* work_dst_ptr = dst_ptr + 1;
+ size_t vl = __riscv_vsetvlmax_e8m4();
+ vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
+ dst_ptr[0] = src_ptr[0];
+ while (src_width > 0) {
+ vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
+ vuint16m8_t v_src0_u16, v_src1_u16;
+ size_t vl = __riscv_vsetvl_e8m4(src_width);
+ v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+ v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);
+
+ v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
+ v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
+ v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
+ v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);
+
+ v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
+ v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);
+
+ __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl);
+
+ src_width -= vl;
+ work_src_ptr += vl;
+ work_dst_ptr += 2 * vl;
+ }
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
+}
+#endif
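+
+// One step of the 2x linear kernel above, in scalar form (the first and
+// last destination pixels are written directly from the source edges, as in
+// the code above):
+// dst[2 * i + 1] = (3 * src[i] + src[i + 1] + 2) >> 2;
+// dst[2 * i + 2] = (src[i] + 3 * src[i + 1] + 2) >> 2;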
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ size_t src_width = work_width >> 1u;
+ const uint8_t* work_s = src_ptr;
+ const uint8_t* work_t = src_ptr + src_stride;
+ const uint8_t* s = work_s;
+ const uint8_t* t = work_t;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ uint8_t* work_d = d + 1;
+ uint8_t* work_e = e + 1;
+ size_t vl = __riscv_vsetvlmax_e16m4();
+ vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+ vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+ d[0] = (3 * s[0] + t[0] + 2) >> 2;
+ e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+ while (src_width > 0) {
+ vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+ vuint16m4_t v_t0_u16_, v_t1_u16_;
+ vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+ size_t vl = __riscv_vsetvl_e8m2(src_width);
+ v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+ v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);
+
+ v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+ v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+ v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+ v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);
+
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+ v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+ v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+ v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+ v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+ v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+ v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+ v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+ v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+ v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+ v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+ v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+ v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+ __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl);
+ __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl);
+
+ src_width -= vl;
+ work_s += vl;
+ work_t += vl;
+ work_d += 2 * vl;
+ work_e += 2 * vl;
+ }
+ d[dst_width - 1] =
+ (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
+ e[dst_width - 1] =
+ (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2_RVV
+void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)src_uv;
+ uint16_t* dst = (uint16_t*)dst_uv;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e32m8(w);
+ vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl);
+ vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl);
+ __riscv_vse16_v_u16m4(dst, v_u1v1, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
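+
+// Each uint32_t lane holds two UV pixels; on a little-endian target the
+// vnsrl by 16 keeps the high half, i.e. the second (odd) UV pixel of every
+// pair, matching the NEON version's ld2/st1 selection. Scalar equivalent
+// with uint16_t UV pairs:
+// for (int i = 0; i < dst_width; ++i) dst[i] = src[2 * i + 1];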
+
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint16_t* src = (const uint16_t*)src_uv;
+ (void)src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_u0v0, v_u1v1, v_avg;
+ vuint16m4_t v_u0v0_16, v_u1v1_16;
+ size_t vl = __riscv_vsetvl_e16m4(w);
+ __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
+ v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
+ v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
+ // Use round-to-nearest-up mode for averaging add
+ v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2);
+ __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
+ w -= vl;
+ src += vl * 2;
+ dst_uv += vl * 2;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_RVV
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint8_t* src_uv_row1 = src_uv + src_stride;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
+ vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
+ vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1;
+ vuint16m4_t v_sum0, v_sum1;
+ vuint8m2_t v_dst_u, v_dst_v;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+
+ __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0,
+ src_uv, vl);
+ __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1,
+ src_uv_row1, vl);
+
+ v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
+ v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
+ v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
+ v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);
+
+ v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
+ v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl);
+ v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl);
+
+ __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl);
+
+ dst_uv += 2 * vl;
+ src_uv += 4 * vl;
+ w -= vl;
+ src_uv_row1 += 4 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN4_RVV
+void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ // Overflow cannot happen here: size_t is twice as wide as int
+ // (sizeof(size_t) / sizeof(int) == 2), and dst_width = src_width / 4
+ // where src_width is also an int.
+ size_t w = (size_t)dst_width * 8;
+ (void)src_stride;
+ (void)src_stepx;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl);
+ vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row);
+ // Narrowing without clipping
+ vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8);
+ vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8);
+ vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16);
+ __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4);
+ w -= vl;
+ src_uv += vl;
+ dst_uv += vl / 4;
+ } while (w > 0);
+}
+#endif
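+
+// Each uint64_t lane holds four UV pixels; the two truncating narrows keep
+// the low 16 bits, i.e. the first UV pixel of every group of four on a
+// little-endian target. Scalar equivalent with uint16_t UV pairs:
+// for (int i = 0; i < dst_width; ++i) dst[i] = src[4 * i];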
+
+#ifdef HAS_SCALEUVROWDOWNEVEN_RVV
+void ScaleUVRowDownEven_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2;
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e16m8(w);
+ vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl);
+ __riscv_vse16_v_u16m8(dst, v_row, vl);
+ w -= vl;
+ src += vl * src_stepx;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+// The ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other
+// platforms' ScaleUVRowUp2_(Bi)linear_Any_XXX. They process the entire row;
+// other platforms implement only the non-edge part of the image and handle
+// the edges with scalar code.
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
+ const uint8_t* work_src_ptr = src_ptr;
+ size_t vl = __riscv_vsetvlmax_e8m4();
+ vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ while (work_width > 0) {
+ vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
+ vuint16m4_t v_dst_odd, v_dst_even;
+ vuint16m8_t v_uv0_u16, v_uv1_u16;
+ size_t vl = __riscv_vsetvl_e8m4(work_width);
+ v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+ v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);
+
+ v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
+ v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);
+
+ v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
+ v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);
+
+ v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
+ v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);
+
+ v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
+ v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);
+
+ __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2);
+
+ work_width -= vl;
+ work_src_ptr += vl;
+ work_dst_ptr += vl;
+ }
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ const uint8_t* work_s = src_ptr;
+ const uint8_t* work_t = src_ptr + src_stride;
+ const uint8_t* s = work_s;
+ const uint8_t* t = work_t;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ uint16_t* work_d = (uint16_t*)d + 1;
+ uint16_t* work_e = (uint16_t*)e + 1;
+ size_t vl = __riscv_vsetvlmax_e16m4();
+ vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+ vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+ d[0] = (3 * s[0] + t[0] + 2) >> 2;
+ e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+ d[1] = (3 * s[1] + t[1] + 2) >> 2;
+ e[1] = (s[1] + 3 * t[1] + 2) >> 2;
+ while (work_width > 0) {
+ vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+ vuint16m4_t v_t0_u16_, v_t1_u16_;
+ vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
+ vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+ size_t vl = __riscv_vsetvl_e8m2(work_width);
+ v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+ v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);
+
+ v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+ v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+ v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+ v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);
+
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+ v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+ v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+ v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+ v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+ v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+ v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+ v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+ v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+ v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+ v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+ v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+ v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+ v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
+ v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
+ v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
+ v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);
+
+ __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2);
+ __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2);
+
+ work_width -= vl;
+ work_s += vl;
+ work_t += vl;
+ work_d += vl;
+ work_e += vl;
+ }
+ d[2 * dst_width - 2] =
+ (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
+ 2;
+ e[2 * dst_width - 2] =
+ (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
+ 2;
+ d[2 * dst_width - 1] =
+ (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
+ 2;
+ e[2 * dst_width - 1] =
+ (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
+ 2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+ // defined(__clang__)
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
new file mode 100644
index 00000000..0931c89a
--- /dev/null
+++ b/source/scale_uv.cc
@@ -0,0 +1,1210 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV plane to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2;
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_RVV
+ : ScaleUVRowDown2Box_RVV);
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV plane to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static int ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int row_size = (dst_width * 2 * 2 + 15) & ~15;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_RVV;
+ }
+#endif
+
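+  // Each pass halves both dimensions: the first two calls box-average pairs
+  // of source rows into two half-width intermediate rows, and the third call
+  // box-averages those two rows, so each output pixel is the mean of a 4x4
+  // block of source pixels.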
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size,
+ dst_width * 2);
+ ScaleUVRowDown2(row, row_size, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+#endif // HAS_SCALEUVDOWN4BOX
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV plane by an even
+// integer factor (to 1/2, 1/4, 1/6, ... of its original size).
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
+ if (TestCpuFlag(kCpuHasRVV) && !filtering) {
+ ScaleUVRowDownEven =
+ (col_step == 4) ? ScaleUVRowDown4_RVV : ScaleUVRowDownEven_RVV;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static int ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ const int max_y = (src_height - 1) << 16;
+ align_buffer_64(row, clip_src_width * 2);
+ if (!row)
+ return 1;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+#endif
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static int ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
+
+ // Allocate 2 rows of UV.
+ const int row_size = (dst_width * 2 + 15) & ~15;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+
+ uint8_t* rowptr = row;
+ int rowstride = row_size;
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ if (src_height > 2) {
+ src += src_stride;
+ }
+
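+    // rowptr/rowstride implement a two-row ping-pong buffer: after scaling a
+    // new source row, rowstride is negated so rowptr alternates between the
+    // two rows, and only one row needs horizontal scaling per source advance.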
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * (intptr_t)src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+#endif // HAS_SCALEUVBILINEARUP
+
+// Scale UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to twice its
+// original width, using linear interpolation.
+// This is used to scale U and V planes of NV16 to NV24.
+static void ScaleUVLinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv) {
+ void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_RVV;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to twice its
+// original size, using bilinear interpolation.
+// This is used to scale U and V planes of NV12 to NV24.
+static void ScaleUVBilinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_RVV;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale 16 bit UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to twice its
+// original width, using linear interpolation.
+// This is used to scale U and V planes of P210 to P410.
+static void ScaleUVLinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_uv,
+ uint16_t* dst_uv) {
+ void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale 16 bit UV, up by 2 times.
+// This is an optimized version for scaling up a plane to twice its
+// original size, using bilinear interpolation.
+// This is used to scale U and V planes of P010 to P410.
+static void ScaleUVBilinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
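+// For example, scaling 640 wide down to 512 gives dx of roughly 0x14000
+// (1.25 in 16.16 fixed point), so every 4 output pixels advance the source
+// position by 5 pixels.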
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x,
+ dx);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
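+  // Interleaved UV is 2 bytes per pixel, hence width * 2 bytes per row.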
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
+ return 0;
+}
+
+static int UVCopy_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static int ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * (intptr_t)src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * (intptr_t)src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
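+    // With 16.16 fixed point steps, a zero fractional part means an integer
+    // scale factor, and bit 16 is the low bit of that integer: the tests on
+    // (dx & 0x10000) below separate even factors (2, 4, ...) from odd ones
+    // (3, 5, ...).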
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return 0;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ return ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y,
+ dy);
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return 0;
+#endif
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2,
+ src_stride, dst, dst_stride, clip_width, clip_height);
+ return 0;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+ return 0;
+ }
+ if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) {
+ ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst);
+ return 0;
+ }
+ if ((clip_height + 1) / 2 == src_height &&
+ (clip_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ return ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ return ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+ return 0;
+}
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ return ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv,
+ dst_stride_uv, dst_width, dst_height, 0, 0, dst_width,
+ dst_height, filtering);
+}
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int dy = 0;
+
+ if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ src_width = Abs(src_width);
+
+#ifdef HAS_UVCOPY
+ if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
+ if (dst_height == 1) {
+ UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv,
+ src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+ } else {
+ dy = src_height / dst_height;
+ UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv,
+ (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv,
+ dst_width, dst_height);
+ }
+
+ return 0;
+ }
+#endif
+
+ if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) {
+ ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
+ src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+ return 0;
+ }
+
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
+ src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+ return 0;
+ }
+
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_win.cc b/source/scale_win.cc
index c5fc86f3..ea1f95c6 100644
--- a/files/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
diff --git a/source/test.sh b/source/test.sh
new file mode 100755
index 00000000..7f12c3c1
--- /dev/null
+++ b/source/test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -x
+
+function runbenchmark1 {
+ perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
+ perf report | grep AVX
+}
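+
+# The perf binary path above is specific to the author's checkout; point it
+# at your own libyuv_test build output to reproduce these measurements.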
+
+runbenchmark1 ABGRToI420
+runbenchmark1 Android420ToI420
+runbenchmark1 ARGBToI420
+runbenchmark1 Convert16To8Plane
+runbenchmark1 ConvertToARGB
+runbenchmark1 ConvertToI420
+runbenchmark1 CopyPlane
+runbenchmark1 H010ToAB30
+runbenchmark1 H010ToAR30
+runbenchmark1 HalfFloatPlane
+runbenchmark1 I010ToAB30
+runbenchmark1 I010ToAR30
+runbenchmark1 I420Copy
+runbenchmark1 I420Psnr
+runbenchmark1 I420Scale
+runbenchmark1 I420Ssim
+runbenchmark1 I420ToARGB
+runbenchmark1 I420ToNV12
+runbenchmark1 I420ToUYVY
+runbenchmark1 I422ToI420
+runbenchmark1 InitCpuFlags
+runbenchmark1 J420ToARGB
+runbenchmark1 NV12ToARGB
+runbenchmark1 NV12ToI420
+runbenchmark1 NV12ToI420Rotate
+runbenchmark1 SetCpuFlags
+runbenchmark1 YUY2ToI420
diff --git a/files/source/video_common.cc b/source/video_common.cc
index 92384c05..92384c05 100644
--- a/files/source/video_common.cc
+++ b/source/video_common.cc
diff --git a/tools_libyuv/OWNERS b/tools_libyuv/OWNERS
new file mode 100644
index 00000000..aae4fb6e
--- /dev/null
+++ b/tools_libyuv/OWNERS
@@ -0,0 +1,4 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
+
diff --git a/tools_libyuv/autoroller/roll_deps.py b/tools_libyuv/autoroller/roll_deps.py
new file mode 100755
index 00000000..d5c1089f
--- /dev/null
+++ b/tools_libyuv/autoroller/roll_deps.py
@@ -0,0 +1,822 @@
+#!/usr/bin/env vpython3
+
+# Copyright (c) 2017 The LibYUV project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+"""Script to automatically roll dependencies in the LibYUV DEPS file."""
+
+
+import argparse
+import base64
+import collections
+import logging
+import os
+import re
+import subprocess
+import sys
+import urllib.request
+
+
+def FindSrcDirPath():
+ """Returns the abs path to the src/ dir of the project."""
+ src_dir = os.path.dirname(os.path.abspath(__file__))
+ while os.path.basename(src_dir) != 'src':
+ src_dir = os.path.normpath(os.path.join(src_dir, os.pardir))
+ return src_dir
+
+
+# Skip these dependencies (list without solution name prefix).
+DONT_AUTOROLL_THESE = [
+ 'third_party/fuchsia-gn-sdk',
+ 'src/third_party/gflags/src',
+ 'src/third_party/mockito/src',
+]
+
+# These dependencies are missing in chromium/src/DEPS, either unused or already
+# in-tree. For instance, src/base is a part of the Chromium source git repo,
+# but we pull it through a subtree mirror, and therefore it isn't listed in
+# Chromium's deps but it is in ours.
+LIBYUV_ONLY_DEPS = [
+ 'src/base',
+ 'src/build',
+ 'src/buildtools',
+ 'src/ios',
+ 'src/testing',
+ 'src/third_party',
+ 'src/third_party/android_support_test_runner',
+ 'src/third_party/bazel',
+ 'src/third_party/bouncycastle',
+ 'src/third_party/errorprone/lib',
+ 'src/third_party/findbugs',
+ 'src/third_party/gson',
+ 'src/third_party/gtest-parallel',
+ 'src/third_party/guava',
+ 'src/third_party/intellij',
+ 'src/third_party/jsr-305/src',
+ 'src/third_party/ow2_asm',
+ 'src/third_party/proguard',
+ 'src/third_party/ub-uiautomator/lib',
+ 'src/tools',
+ 'src/tools/clang/dsymutil',
+]
+
+LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
+CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
+CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
+CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
+CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
+
+COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([-0-9a-z]+)\'$')
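+# CLANG_REVISION_RE matches lines in tools/clang/scripts/update.py such as
+# (hypothetical value): CLANG_REVISION = 'llvmorg-17-init-10134-g1234abcd'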
+ROLL_BRANCH_NAME = 'roll_chromium_revision'
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKOUT_SRC_DIR = FindSrcDirPath()
+CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
+
+# Copied from tools/android/roll/android_deps/.../BuildConfigGenerator.groovy.
+ANDROID_DEPS_START = r'=== ANDROID_DEPS Generated Code Start ==='
+ANDROID_DEPS_END = r'=== ANDROID_DEPS Generated Code End ==='
+# Location of automatically gathered android deps.
+ANDROID_DEPS_PATH = 'src/third_party/android_deps/'
+
+sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
+import find_depot_tools
+
+find_depot_tools.add_depot_tools_to_path()
+
+CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
+CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
+ 'clang', 'scripts', 'update.py')
+
+DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
+ChangedDep = collections.namedtuple('ChangedDep',
+ 'path url current_rev new_rev')
+CipdDepsEntry = collections.namedtuple('CipdDepsEntry', 'path packages')
+VersionEntry = collections.namedtuple('VersionEntry', 'version')
+ChangedCipdPackage = collections.namedtuple(
+ 'ChangedCipdPackage', 'path package current_version new_version')
+ChangedVersionEntry = collections.namedtuple(
+ 'ChangedVersionEntry', 'path current_version new_version')
+
+ChromiumRevisionUpdate = collections.namedtuple('ChromiumRevisionUpdate',
+ ('current_chromium_rev '
+ 'new_chromium_rev '))
+
+
+class RollError(Exception):
+ pass
+
+
+def StrExpansion():
+ return lambda str_value: str_value
+
+
+def VarLookup(local_scope):
+ return lambda var_name: local_scope['vars'][var_name]
+
+
+def ParseDepsDict(deps_content):
+ local_scope = {}
+ global_scope = {
+ 'Str': StrExpansion(),
+ 'Var': VarLookup(local_scope),
+ 'deps_os': {},
+ }
+ exec(deps_content, global_scope, local_scope)
+ return local_scope
+
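+# ParseDepsDict relies on exec() evaluating the DEPS file top to bottom: as
+# DEPS files conventionally define 'vars' before 'deps', the Var() lambda can
+# resolve variables while 'deps' is still being evaluated. A hypothetical
+# snippet this handles:
+#   vars = {'chromium_git': 'https://chromium.googlesource.com'}
+#   deps = {'src/foo': Var('chromium_git') + '/foo' + '@' + 'deadbeef'}
+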
+
+def ParseLocalDepsFile(filename):
+ with open(filename, 'rb') as f:
+ deps_content = f.read().decode('utf-8')
+ return ParseDepsDict(deps_content)
+
+
+def ParseCommitPosition(commit_message):
+ for line in reversed(commit_message.splitlines()):
+ m = COMMIT_POSITION_RE.match(line.strip())
+ if m:
+ return int(m.group(1))
+ logging.error('Failed to parse commit position id from:\n%s\n',
+ commit_message)
+ sys.exit(-1)
+
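+# The footer being parsed looks like, e.g.:
+#   Cr-Commit-Position: refs/heads/main@{#1234567}
+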
+
+def _RunCommand(command,
+ working_dir=None,
+ ignore_exit_code=False,
+ extra_env=None,
+ input_data=None):
+ """Runs a command and returns the output from that command.
+
+ If the command fails (exit code != 0), the function will exit the process.
+
+ Returns:
+ A tuple containing the stdout and stderr outputs as strings.
+ """
+ working_dir = working_dir or CHECKOUT_SRC_DIR
+ logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
+ env = os.environ.copy()
+ if extra_env:
+ assert all(isinstance(value, str) for value in extra_env.values())
+ logging.debug('extra env: %s', extra_env)
+ env.update(extra_env)
+ p = subprocess.Popen(command,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=env,
+ cwd=working_dir,
+ universal_newlines=True)
+ std_output, err_output = p.communicate(input_data)
+ p.stdout.close()
+ p.stderr.close()
+ if not ignore_exit_code and p.returncode != 0:
+ logging.error('Command failed: %s\n'
+ 'stdout:\n%s\n'
+ 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
+ sys.exit(p.returncode)
+ return std_output, err_output
+
+
+def _GetBranches():
+ """Returns a tuple of active,branches.
+
+ The 'active' is the name of the currently active branch and 'branches' is a
+ list of all branches.
+ """
+ lines = _RunCommand(['git', 'branch'])[0].split('\n')
+ branches = []
+ active = ''
+ for line in lines:
+ if '*' in line:
+ # The assumption is that the first char will always be the '*'.
+ active = line[1:].strip()
+ branches.append(active)
+ else:
+ branch = line.strip()
+ if branch:
+ branches.append(branch)
+ return active, branches
+
+
+def _ReadGitilesContent(url):
+ # Download and decode BASE64 content until
+ # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
+ base64_content = ReadUrlContent(url + '?format=TEXT')
+ return base64.b64decode(base64_content[0]).decode('utf-8')
+
+
+def ReadRemoteCrFile(path_below_src, revision):
+ """Reads a remote Chromium file of a specific revision.
+
+ Args:
+ path_below_src: A path to the target file relative to src dir.
+ revision: Revision to read.
+ Returns:
+ A string with file content.
+ """
+ return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE %
+ (revision, path_below_src))
+
+
+def ReadRemoteCrCommit(revision):
+ """Reads a remote Chromium commit message. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
+
+
+def ReadUrlContent(url):
+ """Connect to a remote host and read the contents.
+
+ Args:
+ url: URL to connect to.
+ Returns:
+ A list of lines.
+ """
+ conn = urllib.request.urlopen(url)
+ try:
+ return conn.readlines()
+ except IOError as e:
+ logging.exception('Error connecting to %s. Error: %s', url, e)
+ raise
+ finally:
+ conn.close()
+
+
+def GetMatchingDepsEntries(depsentry_dict, dir_path):
+ """Gets all deps entries matching the provided path.
+
+ This list may contain more than one DepsEntry object.
+ Example: dir_path='src/testing' would give results containing both
+ 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's
+ DEPS.
+ Example 2: dir_path='src/build' should return 'src/build' but not
+ 'src/buildtools'.
+
+ Returns:
+ A list of DepsEntry objects.
+ """
+ result = []
+ for path, depsentry in depsentry_dict.items():
+ if path == dir_path:
+ result.append(depsentry)
+ else:
+ parts = path.split('/')
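+      # Compare component-wise so that dir_path='src/build' matches
+      # 'src/build/foo' but not 'src/buildtools'.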
+ if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))):
+ result.append(depsentry)
+ return result
+
+
+def BuildDepsentryDict(deps_dict):
+ """Builds a dict of paths to DepsEntry objects from a raw deps dict."""
+ result = {}
+
+ def AddDepsEntries(deps_subdict):
+ for path, dep in deps_subdict.items():
+ if path in result:
+ continue
+ if not isinstance(dep, dict):
+ dep = {'url': dep}
+ if dep.get('dep_type') == 'cipd':
+ result[path] = CipdDepsEntry(path, dep['packages'])
+ else:
+ if '@' not in dep['url']:
+ continue
+ url, revision = dep['url'].split('@')
+ result[path] = DepsEntry(path, url, revision)
+
+ def AddVersionEntry(vars_subdict):
+ for key, value in vars_subdict.items():
+ if key in result:
+ continue
+ if not key.endswith('_version'):
+ continue
+ key = re.sub('_version$', '', key)
+ result[key] = VersionEntry(value)
+
+ AddDepsEntries(deps_dict['deps'])
+  for deps_os in ['win', 'mac', 'unix', 'android', 'ios']:
+ AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
+ AddVersionEntry(deps_dict.get('vars', {}))
+ return result
+
+
+def _FindChangedCipdPackages(path, old_pkgs, new_pkgs):
+ old_pkgs_names = {p['package'] for p in old_pkgs}
+ new_pkgs_names = {p['package'] for p in new_pkgs}
+ pkgs_equal = (old_pkgs_names == new_pkgs_names)
+ added_pkgs = [p for p in new_pkgs_names if p not in old_pkgs_names]
+ removed_pkgs = [p for p in old_pkgs_names if p not in new_pkgs_names]
+
+ assert pkgs_equal, ('Old: %s\n New: %s.\nYou need to do a manual roll '
+ 'and remove/add entries in DEPS so the old and new '
+ 'list match.\nMost likely, you should add \"%s\" and '
+ 'remove \"%s\"' %
+ (old_pkgs, new_pkgs, added_pkgs, removed_pkgs))
+
+ for old_pkg in old_pkgs:
+ for new_pkg in new_pkgs:
+ old_version = old_pkg['version']
+ new_version = new_pkg['version']
+ if (old_pkg['package'] == new_pkg['package']
+ and old_version != new_version):
+ logging.debug('Roll dependency %s to %s', path, new_version)
+ yield ChangedCipdPackage(path, old_pkg['package'], old_version,
+ new_version)
+
+
+def _FindChangedVars(name, old_version, new_version):
+ if old_version != new_version:
+ logging.debug('Roll dependency %s to %s', name, new_version)
+ yield ChangedVersionEntry(name, old_version, new_version)
+
+
+def _FindNewDeps(old, new):
+ """ Gather dependencies only in `new` and return corresponding paths. """
+ old_entries = set(BuildDepsentryDict(old))
+ new_entries = set(BuildDepsentryDict(new))
+ return [
+ path for path in new_entries - old_entries
+ if path not in DONT_AUTOROLL_THESE
+ ]
+
+
+def FindAddedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate new deps entries of interest.
+
+  Ideally, that would mean: deps only appearing in the Chromium DEPS file
+  but transitively used in LibYUV.
+
+  Since that's hard to compute, we restrict ourselves to a well-defined
+  subset: deps sitting in `ANDROID_DEPS_PATH`. Anything else is assumed
+  to be a Chromium-only dependency.
+
+ Args:
+ libyuv_deps: dict of deps as defined in the LibYUV DEPS file.
+ new_cr_deps: dict of deps as defined in the chromium DEPS file.
+
+  Caveat: Doesn't detect a new package in an existing dep.
+
+ Returns:
+ A tuple consisting of:
+      A list of paths of added dependencies sitting in `ANDROID_DEPS_PATH`.
+ A list of paths for other added dependencies.
+ """
+ all_added_deps = _FindNewDeps(libyuv_deps, new_cr_deps)
+ generated_android_deps = [
+ path for path in all_added_deps if path.startswith(ANDROID_DEPS_PATH)
+ ]
+ other_deps = [
+ path for path in all_added_deps if path not in generated_android_deps
+ ]
+ return generated_android_deps, other_deps
+
+
+def FindRemovedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate obsolete deps entries.
+
+  Ideally, that would mean: deps no longer appearing in the Chromium DEPS
+  file and not used in LibYUV.
+
+ Since it's hard to compute:
+ 1/ We restrict ourselves to a well defined subset:
+ deps sitting in `ANDROID_DEPS_PATH`.
+  2/ We rely on the existing behavior of CalculateChangedDeps,
+     i.e. assume non-CIPD dependencies are LibYUV-only and don't remove them.
+
+ Args:
+ libyuv_deps: dict of deps as defined in the LibYUV DEPS file.
+ new_cr_deps: dict of deps as defined in the chromium DEPS file.
+
+  Caveat: Doesn't detect a deleted package in an existing dep.
+
+ Returns:
+ A tuple consisting of:
+ A list of paths of dependencies removed from `ANDROID_DEPS_PATH`.
+ A list of paths of unexpected disappearing dependencies.
+ """
+ all_removed_deps = _FindNewDeps(new_cr_deps, libyuv_deps)
+ generated_android_deps = sorted(
+ [path for path in all_removed_deps if path.startswith(ANDROID_DEPS_PATH)])
+  # LibYUV-only dependencies are handled in CalculateChangedDeps.
+ other_deps = sorted([
+ path for path in all_removed_deps
+ if path not in generated_android_deps and path not in LIBYUV_ONLY_DEPS
+ ])
+ return generated_android_deps, other_deps
+
+
+def CalculateChangedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate changed deps entries based on entries defined in the LibYUV DEPS
+ file:
+  - If it's a dependency shared with the Chromium DEPS file: roll it to the
+    same revision as Chromium (i.e. the entry in the new_cr_deps dict)
+ - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
+ this means it may be ahead of the chromium_revision, but generally these
+ should be close).
+ - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
+ unless it's configured to be skipped.
+
+ Returns:
+ A list of ChangedDep objects representing the changed deps.
+ """
+ result = []
+ libyuv_entries = BuildDepsentryDict(libyuv_deps)
+ new_cr_entries = BuildDepsentryDict(new_cr_deps)
+ for path, libyuv_deps_entry in libyuv_entries.items():
+ if path in DONT_AUTOROLL_THESE:
+ continue
+ cr_deps_entry = new_cr_entries.get(path)
+ if cr_deps_entry:
+ assert type(cr_deps_entry) is type(libyuv_deps_entry)
+
+ if isinstance(cr_deps_entry, CipdDepsEntry):
+ result.extend(
+ _FindChangedCipdPackages(path, libyuv_deps_entry.packages,
+ cr_deps_entry.packages))
+ continue
+
+ if isinstance(cr_deps_entry, VersionEntry):
+ result.extend(
+ _FindChangedVars(path, libyuv_deps_entry.version,
+ cr_deps_entry.version))
+ continue
+
+ # Use the revision from Chromium's DEPS file.
+ new_rev = cr_deps_entry.revision
+ assert libyuv_deps_entry.url == cr_deps_entry.url, (
+ 'LibYUV DEPS entry %s has a different URL %s than Chromium %s.' %
+ (path, libyuv_deps_entry.url, cr_deps_entry.url))
+ else:
+ if isinstance(libyuv_deps_entry, DepsEntry):
+ # Use the HEAD of the deps repo.
+ stdout, _ = _RunCommand(
+ ['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD'])
+ new_rev = stdout.strip().split('\t')[0]
+ else:
+ # The dependency has been removed from chromium.
+ # This is handled by FindRemovedDeps.
+ continue
+
+ # Check if an update is necessary.
+ if libyuv_deps_entry.revision != new_rev:
+ logging.debug('Roll dependency %s to %s', path, new_rev)
+ result.append(
+ ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision,
+ new_rev))
+ return sorted(result)
+
+
+def CalculateChangedClang(new_cr_rev):
+
+ def GetClangRev(lines):
+ for line in lines:
+ match = CLANG_REVISION_RE.match(line)
+ if match:
+ return match.group(1)
+ raise RollError('Could not parse Clang revision!')
+
+ with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f:
+ current_lines = f.readlines()
+ current_rev = GetClangRev(current_lines)
+
+ new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
+ new_cr_rev).splitlines()
+ new_rev = GetClangRev(new_clang_update_py)
+ return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
+
+
+def GenerateCommitMessage(
+ rev_update,
+ current_commit_pos,
+ new_commit_pos,
+ changed_deps_list,
+ added_deps_paths=None,
+ removed_deps_paths=None,
+ clang_change=None,
+):
+ current_cr_rev = rev_update.current_chromium_rev[0:10]
+ new_cr_rev = rev_update.new_chromium_rev[0:10]
+ rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
+ git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
+
+ commit_msg = [
+ 'Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval),
+ 'Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval),
+ 'Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval)
+ ]
+
+ def Section(adjective, deps):
+ noun = 'dependency' if len(deps) == 1 else 'dependencies'
+ commit_msg.append('%s %s' % (adjective, noun))
+
+ if changed_deps_list:
+ Section('Changed', changed_deps_list)
+
+ for c in changed_deps_list:
+ if isinstance(c, ChangedCipdPackage):
+ commit_msg.append('* %s: %s..%s' %
+ (c.path, c.current_version, c.new_version))
+ elif isinstance(c, ChangedVersionEntry):
+      commit_msg.append('* %s_version: %s..%s' %
+ (c.path, c.current_version, c.new_version))
+ else:
+ commit_msg.append('* %s: %s/+log/%s..%s' %
+ (c.path, c.url, c.current_rev[0:10], c.new_rev[0:10]))
+
+ if added_deps_paths:
+ Section('Added', added_deps_paths)
+ commit_msg.extend('* %s' % p for p in added_deps_paths)
+
+ if removed_deps_paths:
+ Section('Removed', removed_deps_paths)
+ commit_msg.extend('* %s' % p for p in removed_deps_paths)
+
+ if any([changed_deps_list, added_deps_paths, removed_deps_paths]):
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
+ commit_msg.append('DEPS diff: %s\n' % change_url)
+ else:
+ commit_msg.append('No dependencies changed.')
+
+ if clang_change and clang_change.current_rev != clang_change.new_rev:
+ commit_msg.append('Clang version changed %s:%s' %
+ (clang_change.current_rev, clang_change.new_rev))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
+ CLANG_UPDATE_SCRIPT_URL_PATH)
+ commit_msg.append('Details: %s\n' % change_url)
+ else:
+ commit_msg.append('No update to Clang.\n')
+
+ commit_msg.append('BUG=None')
+ return '\n'.join(commit_msg)
+
+
+def UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content):
+ """Update the DEPS file with the new revision."""
+
+ with open(deps_filename, 'rb') as deps_file:
+ deps_content = deps_file.read().decode('utf-8')
+
+ # Update the chromium_revision variable.
+ deps_content = deps_content.replace(rev_update.current_chromium_rev,
+ rev_update.new_chromium_rev)
+
+ # Add and remove dependencies. For now: only generated android deps.
+  # Since gclient cannot add or remove deps, we rely on the fact that
+  # these android deps are located in one place that we can copy/paste.
+ deps_re = re.compile(ANDROID_DEPS_START + '.*' + ANDROID_DEPS_END, re.DOTALL)
+ new_deps = deps_re.search(new_cr_content)
+ old_deps = deps_re.search(deps_content)
+ if not new_deps or not old_deps:
+ faulty = 'Chromium' if not new_deps else 'LibYUV'
+ raise RollError('Was expecting to find "%s" and "%s"\n'
+ 'in %s DEPS' %
+ (ANDROID_DEPS_START, ANDROID_DEPS_END, faulty))
+ deps_content = deps_re.sub(new_deps.group(0), deps_content)
+
+ for dep in changed_deps:
+ if isinstance(dep, ChangedVersionEntry):
+ deps_content = deps_content.replace(dep.current_version, dep.new_version)
+
+ with open(deps_filename, 'wb') as deps_file:
+ deps_file.write(deps_content.encode('utf-8'))
+
+ # Update each individual DEPS entry.
+ for dep in changed_deps:
+    # ChangedVersionEntry types have already been processed above.
+ if isinstance(dep, ChangedVersionEntry):
+ continue
+ local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
+ if not os.path.isdir(local_dep_dir):
+ raise RollError(
+ 'Cannot find local directory %s. Either run\n'
+ 'gclient sync --deps=all\n'
+ 'or make sure the .gclient file for your solution contains all '
+ 'platforms in the target_os list, i.e.\n'
+ 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
+ 'Then run "gclient sync" again.' % local_dep_dir)
+ if isinstance(dep, ChangedCipdPackage):
+ package = dep.package.format() # Eliminate double curly brackets
+ update = '%s:%s@%s' % (dep.path, package, dep.new_version)
+ else:
+ update = '%s@%s' % (dep.path, dep.new_rev)
+ _RunCommand(['gclient', 'setdep', '--revision', update],
+ working_dir=CHECKOUT_SRC_DIR)
+
+
+def _IsTreeClean():
+ stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
+ if len(stdout) == 0:
+ return True
+
+ logging.error('Dirty/unversioned files:\n%s', stdout)
+ return False
+
+
+def _EnsureUpdatedMainBranch(dry_run):
+ current_branch = _RunCommand(['git', 'rev-parse', '--abbrev-ref',
+ 'HEAD'])[0].splitlines()[0]
+ if current_branch != 'main':
+ logging.error('Please checkout the main branch and re-run this script.')
+ if not dry_run:
+ sys.exit(-1)
+
+ logging.info('Updating main branch...')
+ _RunCommand(['git', 'pull'])
+
+
+def _CreateRollBranch(dry_run):
+ logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
+
+
+def _RemovePreviousRollBranch(dry_run):
+ active_branch, branches = _GetBranches()
+ if active_branch == ROLL_BRANCH_NAME:
+ active_branch = 'main'
+ if ROLL_BRANCH_NAME in branches:
+ logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', active_branch])
+ _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
+
+
+def _LocalCommit(commit_msg, dry_run):
+ logging.info('Committing changes locally.')
+ if not dry_run:
+ _RunCommand(['git', 'add', '--update', '.'])
+ _RunCommand(['git', 'commit', '-m', commit_msg])
+
+
+def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
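+  # Returns 0 to skip the CQ entirely, 1 for a CQ dry run, and 2 to submit
+  # to the CQ (see _UploadCL for how each mode is applied).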
+ if skip_cq:
+ return 0
+ if (new_commit_pos - current_commit_pos) < cq_over:
+ return 1
+ return 2
+
+
+def _GetCcRecipients(changed_deps_list):
+ """Returns a list of emails to notify based on the changed deps list.
+ """
+  cc_recipients = []
+  # No per-dependency CC rules are defined at the moment.
+  return cc_recipients
+
+
+def _UploadCL(commit_queue_mode, add_cc=None):
+ """Upload the committed changes as a changelist to Gerrit.
+
+ commit_queue_mode:
+ - 2: Submit to commit queue.
+ - 1: Run trybots but do not submit to CQ.
+ - 0: Skip CQ, upload only.
+
+ add_cc: A list of email addresses to add as CC recipients.
+ """
+ cc_recipients = []
+ if add_cc:
+ cc_recipients.extend(add_cc)
+ cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks']
+ if commit_queue_mode >= 2:
+ logging.info('Sending the CL to the CQ...')
+ cmd.extend(['-o', 'label=Bot-Commit+1'])
+ cmd.extend(['-o', 'label=Commit-Queue+2'])
+ cmd.extend(['--send-mail', '--cc', ','.join(cc_recipients)])
+ elif commit_queue_mode >= 1:
+ logging.info('Starting CQ dry run...')
+ cmd.extend(['-o', 'label=Commit-Queue+1'])
+ extra_env = {
+ 'EDITOR': 'true',
+ 'SKIP_GCE_AUTH_FOR_GIT': '1',
+ }
+ stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
+ logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
+ stdout, stderr)
+
+
+def GetRollRevisionRanges(opts, libyuv_deps):
+ current_cr_rev = libyuv_deps['vars']['chromium_revision']
+ new_cr_rev = opts.revision
+ if not new_cr_rev:
+ stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
+ head_rev = stdout.strip().split('\t')[0]
+ logging.info('No revision specified. Using HEAD: %s', head_rev)
+ new_cr_rev = head_rev
+
+ return ChromiumRevisionUpdate(current_cr_rev, new_cr_rev)
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('--clean',
+ action='store_true',
+ default=False,
+ help='Removes any previous local roll branch.')
+ p.add_argument('-r',
+ '--revision',
+ help=('Chromium Git revision to roll to. Defaults to the '
+ 'Chromium HEAD revision if omitted.'))
+ p.add_argument('--dry-run',
+ action='store_true',
+ default=False,
+ help=('Calculate changes and modify DEPS, but don\'t create '
+ 'any local branch, commit, upload CL or send any '
+ 'tryjobs.'))
+ p.add_argument('-i',
+ '--ignore-unclean-workdir',
+ action='store_true',
+ default=False,
+ help=('Ignore if the current branch is not main or if there '
+ 'are uncommitted changes (default: %(default)s).'))
+ grp = p.add_mutually_exclusive_group()
+ grp.add_argument('--skip-cq',
+ action='store_true',
+ default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp.add_argument('--cq-over',
+ type=int,
+ default=1,
+ help=('Commit queue dry run if the revision difference '
+ 'is below this number (default: %(default)s)'))
+ p.add_argument('-v',
+ '--verbose',
+ action='store_true',
+ default=False,
+ help='Be extra verbose in printing of log messages.')
+ opts = p.parse_args()
+
+ if opts.verbose:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ if not opts.ignore_unclean_workdir and not _IsTreeClean():
+ logging.error('Please clean your local checkout first.')
+ return 1
+
+ if opts.clean:
+ _RemovePreviousRollBranch(opts.dry_run)
+
+ if not opts.ignore_unclean_workdir:
+ _EnsureUpdatedMainBranch(opts.dry_run)
+
+ deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
+ libyuv_deps = ParseLocalDepsFile(deps_filename)
+
+ rev_update = GetRollRevisionRanges(opts, libyuv_deps)
+
+ current_commit_pos = ParseCommitPosition(
+ ReadRemoteCrCommit(rev_update.current_chromium_rev))
+ new_commit_pos = ParseCommitPosition(
+ ReadRemoteCrCommit(rev_update.new_chromium_rev))
+
+ new_cr_content = ReadRemoteCrFile('DEPS', rev_update.new_chromium_rev)
+ new_cr_deps = ParseDepsDict(new_cr_content)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ # Discard other deps, assumed to be chromium-only dependencies.
+ new_generated_android_deps, _ = FindAddedDeps(libyuv_deps, new_cr_deps)
+ removed_generated_android_deps, other_deps = FindRemovedDeps(
+ libyuv_deps, new_cr_deps)
+ if other_deps:
+ raise RollError('LibYUV DEPS entries are missing from Chromium: %s.\n'
+ 'Remove them or add them to either '
+ 'LIBYUV_ONLY_DEPS or DONT_AUTOROLL_THESE.' % other_deps)
+ clang_change = CalculateChangedClang(rev_update.new_chromium_rev)
+ commit_msg = GenerateCommitMessage(
+ rev_update,
+ current_commit_pos,
+ new_commit_pos,
+ changed_deps,
+ added_deps_paths=new_generated_android_deps,
+ removed_deps_paths=removed_generated_android_deps,
+ clang_change=clang_change)
+ logging.debug('Commit message:\n%s', commit_msg)
+
+ _CreateRollBranch(opts.dry_run)
+ if not opts.dry_run:
+ UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content)
+ if _IsTreeClean():
+ logging.info("No DEPS changes detected, skipping CL creation.")
+ else:
+ _LocalCommit(commit_msg, opts.dry_run)
+ commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
+ current_commit_pos, new_commit_pos)
+ logging.info('Uploading CL...')
+ if not opts.dry_run:
+ _UploadCL(commit_queue_mode, _GetCcRecipients(changed_deps))
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/tools_libyuv/autoroller/unittests/roll_deps_test.py
index 477b6e40..af86bdd5 100755
--- a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
+++ b/tools_libyuv/autoroller/unittests/roll_deps_test.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env vpython3
+
# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -14,14 +15,13 @@ import sys
import tempfile
import unittest
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
-sys.path.append(PARENT_DIR)
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
+sys.path.append(PARENT_DIR)
+
import roll_deps
from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \
    ParseDepsDict, ParseLocalDepsFile, UpdateDepsFile
TEST_DATA_VARS = {
'chromium_git': 'https://chromium.googlesource.com',
@@ -45,7 +45,7 @@ class TestError(Exception):
pass
-class FakeCmd(object):
+class FakeCmd:
def __init__(self):
self.expectations = []
@@ -85,43 +85,43 @@ class TestRollChromiumRevision(unittest.TestCase):
def testVarLookup(self):
local_scope = {'foo': 'wrong', 'vars': {'foo': 'bar'}}
lookup = roll_deps.VarLookup(local_scope)
- self.assertEquals(lookup('foo'), 'bar')
+ self.assertEqual(lookup('foo'), 'bar')
def testUpdateDepsFile(self):
new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111'
current_rev = TEST_DATA_VARS['chromium_revision']
UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, [])
- with open(self._libyuv_depsfile) as deps_file:
+ with open(self._libyuv_depsfile, 'r') as deps_file:
deps_contents = deps_file.read()
self.assertTrue(new_rev in deps_contents,
'Failed to find %s in\n%s' % (new_rev, deps_contents))
def testParseDepsDict(self):
- with open(self._libyuv_depsfile) as deps_file:
+ with open(self._libyuv_depsfile, 'r') as deps_file:
deps_contents = deps_file.read()
local_scope = ParseDepsDict(deps_contents)
vars_dict = local_scope['vars']
def assertVar(variable_name):
- self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
+ self.assertEqual(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
assertVar('chromium_git')
assertVar('chromium_revision')
- self.assertEquals(len(local_scope['deps']), 3)
+ self.assertEqual(len(local_scope['deps']), 3)
def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest')
- self.assertEquals(len(entries), 1)
- self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest'])
+ self.assertEqual(len(entries), 1)
+ self.assertEqual(entries[0], DEPS_ENTRIES['src/testing/gtest'])
def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing')
- self.assertEquals(len(entries), 2)
+ self.assertEqual(len(entries), 2)
def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build')
- self.assertEquals(len(entries), 1)
- self.assertEquals(entries[0], DEPS_ENTRIES['src/build'])
+ self.assertEqual(len(entries), 1)
+ self.assertEqual(entries[0], DEPS_ENTRIES['src/build'])
def testCalculateChangedDeps(self):
_SetupGitLsRemoteCall(self.fake,
@@ -129,14 +129,14 @@ class TestRollChromiumRevision(unittest.TestCase):
libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile)
new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile)
changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
- self.assertEquals(len(changed_deps), 2)
- self.assertEquals(changed_deps[0].path, 'src/build')
- self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV)
- self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV)
-
- self.assertEquals(changed_deps[1].path, 'src/buildtools')
- self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
- self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
+ self.assertEqual(len(changed_deps), 2)
+ self.assertEqual(changed_deps[0].path, 'src/build')
+ self.assertEqual(changed_deps[0].current_rev, BUILD_OLD_REV)
+ self.assertEqual(changed_deps[0].new_rev, BUILD_NEW_REV)
+
+ self.assertEqual(changed_deps[1].path, 'src/buildtools')
+ self.assertEqual(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
+ self.assertEqual(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
def _SetupGitLsRemoteCall(cmd_fake, url, revision):
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS b/tools_libyuv/autoroller/unittests/testdata/DEPS
index 9fbb48a7..4f45860c 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS
@@ -3,6 +3,7 @@
vars = {
'chromium_git': 'https://chromium.googlesource.com',
'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+ 'ignored_str': Str(''),
}
deps = {
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
index d53083ce..d53083ce 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
index dd6ddaec..dd6ddaec 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
diff --git a/files/tools_libyuv/get_landmines.py b/tools_libyuv/get_landmines.py
index c554f04a..8b33483e 100755
--- a/files/tools_libyuv/get_landmines.py
+++ b/tools_libyuv/get_landmines.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+
# Copyright 2016 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -25,8 +26,8 @@ def print_landmines():
# dependency problems, fix the dependency problems instead of adding a
# landmine.
# See the Chromium version in src/build/get_landmines.py for usage examples.
- print 'Clobber to remove GYP artifacts after switching bots to GN.'
- print 'Another try to remove GYP artifacts after switching bots to GN.'
+ print('Clobber to remove GYP artifacts after switching bots to GN.')
+ print('Another try to remove GYP artifacts after switching bots to GN.')
def main():
diff --git a/tools_libyuv/msan/OWNERS b/tools_libyuv/msan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/msan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/files/tools_libyuv/msan/blacklist.txt b/tools_libyuv/msan/blacklist.txt
index 8b5e42a7..8b5e42a7 100644
--- a/files/tools_libyuv/msan/blacklist.txt
+++ b/tools_libyuv/msan/blacklist.txt
diff --git a/tools_libyuv/ubsan/OWNERS b/tools_libyuv/ubsan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/ubsan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/files/tools_libyuv/ubsan/blacklist.txt b/tools_libyuv/ubsan/blacklist.txt
index 8bcb2907..8bcb2907 100644
--- a/files/tools_libyuv/ubsan/blacklist.txt
+++ b/tools_libyuv/ubsan/blacklist.txt
diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/tools_libyuv/ubsan/vptr_blacklist.txt
index 23cfca53..23cfca53 100644
--- a/files/tools_libyuv/ubsan/vptr_blacklist.txt
+++ b/tools_libyuv/ubsan/vptr_blacklist.txt
diff --git a/files/unit_test/basictypes_test.cc b/unit_test/basictypes_test.cc
index 9aaa2dcd..9aaa2dcd 100644
--- a/files/unit_test/basictypes_test.cc
+++ b/unit_test/basictypes_test.cc
diff --git a/files/unit_test/color_test.cc b/unit_test/color_test.cc
index 4bb448d5..01267ff1 100644
--- a/files/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -20,20 +20,22 @@
namespace libyuv {
-// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define ERROR_R 1
-#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 6
-#define ERROR_J420 5
+// TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
+// Port to Visual C and other CPUs
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__))
+#define ERROR_FULL 5
+#define ERROR_J420 4
#else
+#define ERROR_FULL 6
+#define ERROR_J420 6
+#endif
#define ERROR_R 1
#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 5
-#define ERROR_J420 3
+#ifdef LIBYUV_UNLIMITED_DATA
+#define ERROR_B 1
+#else
+#define ERROR_B 18
#endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
@@ -187,6 +189,104 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+#define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+
+static void YUVFToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ F422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -335,18 +435,50 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
+// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
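+// E.g. limited range white (y=235, u=v=128) maps to r=g=b=255.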
+// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
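+// E.g. full range grey (y=128, u=v=128) maps to r=g=b=128.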
+// BT.709 limited range YUV to RGB reference
+// See also http://www.equasys.de/colorconversion.html
+static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+ *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+ *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
+}
+
+// BT.709 full range YUV to RGB reference
+static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y - (v - 128) * -1.5748);
+ *g = RoundToByte(y - (u - 128) * 0.18732 - (v - 128) * 0.46812);
+ *b = RoundToByte(y - (u - 128) * -1.8556);
+}
+
+// BT.2020 limited range YUV to RGB reference
+static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+ *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+ (v - 128) * 0.65042);
+ *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
+}
+
+// BT.2020 full range YUV to RGB reference
+static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y + (v - 128) * 1.474600);
+ *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
+ *b = RoundToByte(y + (u - 128) * 1.881400);
+}
+
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -370,7 +502,11 @@ TEST_F(LibYUVColorTest, TestYUV) {
YUVToRGB(240, 0, 0, &r1, &g1, &b1);
EXPECT_EQ(57, r1);
EXPECT_EQ(255, g1);
+#ifdef LIBYUV_UNLIMITED_DATA
+ EXPECT_EQ(3, b1);
+#else
EXPECT_EQ(5, b1);
+#endif
for (int i = 0; i < 256; ++i) {
YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
@@ -444,28 +580,28 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {
static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
int i;
- printf("hist");
+ printf("hist ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", i - 128);
+ printf(" %8d", i - 128);
}
}
- printf("\nred");
+ printf("\nred ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", rh[i]);
+ printf(" %8d", rh[i]);
}
}
printf("\ngreen");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", gh[i]);
+ printf(" %8d", gh[i]);
}
}
- printf("\nblue");
+ printf("\nblue ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", bh[i]);
+ printf(" %8d", bh[i]);
}
}
printf("\n");
@@ -473,7 +609,13 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
// Stepping by 5 on the inner loop still covers 0 to 255 inclusive.
// Set to 1 for better coverage; 3, 5 or 17 for faster testing.
+#ifdef DISABLE_SLOW_TESTS
#define FASTSTEP 5
+#else
+#define FASTSTEP 1
+#endif
+
+// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
@@ -503,6 +645,7 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
+// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = {
0,
@@ -520,9 +663,129 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
int y = RANDOM256(y2);
YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
YUVJToRGB(y, u, v, &r1, &g1, &b1);
- EXPECT_NEAR(r0, r1, 1);
- EXPECT_NEAR(g0, g1, 1);
- EXPECT_NEAR(b0, b1, 1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.709 limited range.
+TEST_F(LibYUVColorTest, TestFullYUVH) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVHToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.709 full range.
+TEST_F(LibYUVColorTest, TestFullYUVF) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVFToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVFToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.2020 limited range.
+TEST_F(LibYUVColorTest, TestFullYUVU) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVUToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVUToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.2020 full range.
+TEST_F(LibYUVColorTest, TestFullYUVV) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVVToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, 2);
+ EXPECT_NEAR(b0, b1, ERROR_B);
++rh[r1 - r0 + 128];
++gh[g1 - g0 + 128];
++bh[b1 - b0 + 128];
diff --git a/files/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 136254e1..c29562cb 100644
--- a/files/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -15,10 +15,13 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
-#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
+#endif
+
namespace libyuv {
// hash seed of 5381 recommended.
@@ -206,6 +209,7 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
free_aligned_buffer_page_end(src_a);
}
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
@@ -340,7 +344,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32_t h1 = 0;
- const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+ const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 63) & ~63;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 255u, kMaxWidth);
@@ -403,6 +407,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, TestHammingDistance) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
new file mode 100644
index 00000000..aeee8a7f
--- /dev/null
+++ b/unit_test/convert_argb_test.cc
@@ -0,0 +1,2700 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "../unit_test/unit_test.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+// Some functions fail on big endian. Enable these tests on all CPUs except
+// PowerPC; they are not optimized, so they are disabled by default.
+#if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__)
+#define LITTLE_ENDIAN_ONLY_TEST 1
+#endif
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that are unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+// Aliases that copy pixels as-is.
+#define AR30ToAR30 ARGBCopy
+#define ABGRToABGR ARGBCopy
+
+// Subsampled dimensions are computed with a round-up divide.
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
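+// E.g. SUBSAMPLE(5, 2) == 3 (round-up divide); ALIGNINT(5, 4) == 8 (round up
+// to a multiple of ALIGN).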
+
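+// Converts biplanar (Y + interleaved UV) input to planar output twice: once
+// with SIMD masked off (the C reference) and once with the benchmark CPU
+// flags, then checks the optimized output matches the C output byte for byte.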
+#define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, W1280, N, NEG, OFF, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+ const int kPaddedHeight = \
+ (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+ align_buffer_page_end( \
+ src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+ for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
+ src_y_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
+ src_uv_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
+#else
+#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
+#endif
+
+TESTBPTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32)
+TESTBPTOP(P010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10, 1, 1)
+TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
+
+// Provide matrix wrappers for full range bt.709
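+// The ABGR variants swap the U and V planes and use the YVU constants, so the
+// same I4xxToARGBMatrix entry points emit the byte-swapped ordering.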
+#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+#define F422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+#define F444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+
+// Provide matrix wrappers for full range bt.2020
+#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+#define I420ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I422ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I420ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I422ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+
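+// Converts planar YUV to packed FMT_B with the C path and the optimized path,
+// prints the timing of each, and requires identical output.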
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ double time0 = get_time(); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ double time1 = get_time(); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ double time2 = get_time(); \
+ printf(" %8d us C - %8d us OPT\n", \
+ static_cast<int>((time1 - time0) * 1e6), \
+ static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 4) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0)
+#else
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(F420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(F420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
+TESTPLANARTOB(I422, 1, 1, RGB24, 3, 3, 1)
+TESTPLANARTOB(I422, 1, 1, RAW, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, RGB24, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, RAW, 3, 3, 1)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, AB30, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1)
+#endif
+TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1)
+TESTPLANARTOB(I422, 2, 2, RGB24Filter, 3, 3, 1)
+#else // FULL_TESTS
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
+#endif
+
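+// Biplanar to packed RGB: both outputs are expanded to ARGB before comparing,
+// so packed formats such as RGB565 can be checked bytewise.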
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeight); \
+ memset(dst_argb_opt, 101, kStrideB* kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_opt, kWidth * BPP_B, kWidth, \
+ NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
+ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ kHeight); \
+ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ kHeight); \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \
+ dst_argb32_opt[i * kWidth * 4 + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
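+// Full range (JPEG) NV12/NV21 wrappers; the ABGR and RAW variants swap the
+// NV12/NV21 entry points and use the YVU constants.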
+#define JNV12ToARGB(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToARGB(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToABGR(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToABGR(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB24(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToRGB24(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToRAW(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToRAW(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB565(a, b, c, d, e, f, g, h) \
+ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+
+TESTBPTOB(JNV12, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(JNV21, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(JNV12, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(JNV21, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(JNV12, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(JNV21, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(JNV12, 2, 2, RAW, RAW, 3)
+TESTBPTOB(JNV21, 2, 2, RAW, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBPTOB(JNV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
+TESTBPTOB(NV12, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(NV21, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(NV12, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(NV21, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(NV12, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(NV21, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(NV12, 2, 2, RAW, RAW, 3)
+TESTBPTOB(NV21, 2, 2, RAW, RAW, 3)
+TESTBPTOB(NV21, 2, 2, YUV24, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
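+// Generic packed-to-packed conversion test: fills the source with random
+// bytes and requires the optimized path to match the C path exactly.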
+#define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
+ kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
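+// Random-size variant: runs the conversion over randomly sized images (1-64
+// wide, 1-32 tall) to exercise edge widths in the SIMD row functions.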
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \
+ TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ align_buffer_page_end(dst_argb_c, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i] = 0xfe; \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \
+ kStrideB, kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \
+ kStrideB, kWidth, kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 4) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B)
+#else
+#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1)
+TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1)
+TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+
+// In-place test: the conversion reads and writes the same buffer; the
+// optimized in-place result must match the C in-place result.
+#define TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memcpy(dst_argb_c + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ memcpy(dst_argb_opt + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_c /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_c, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ } \
+ memcpy(dst_argb_opt + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOA(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Inplace, +, 0)
+
+TESTATOA(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+TESTATOA(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1)
+// TODO(fbarchard): Support in place for mirror.
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOA(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOA(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOA(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1)
+// TODO(fbarchard): Support in place for conversions that increase bpp.
+// TESTATOA(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1)
+// TESTATOA(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1)
+// TESTATOA(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1)
+// TESTATOA(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1)
+// TESTATOA(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+// TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1)
+TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1)
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+// TESTATOA(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+// TESTATOA(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
+ NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
+ kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
+ NULL, kWidth, kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 2) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B)
+#else
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B)
+#endif
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
+
+// These conversions, called twice, produce the original result,
+// e.g. an endian swap applied twice.
+#define TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \
+ OFF) \
+ TEST_F(LibYUVConvertTest, FMT_ATOB##_Endswap##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ memset(dst_argb_opt, 101, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \
+ kStrideA, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \
+ kStrideA, kWidth, NEG kHeight); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \
+ kWidth, NEG kHeight); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ + 1, \
+ _Any, +, 0) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Unaligned, +, 2) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Opt, +, 0)
+#else
+#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Opt, +, 0)
+#endif
+
+TESTEND(ARGBToBGRA, uint8_t, 4, 4, 1)
+TESTEND(ARGBToABGR, uint8_t, 4, 4, 1)
+TESTEND(BGRAToARGB, uint8_t, 4, 4, 1)
+TESTEND(ABGRToARGB, uint8_t, 4, 4, 1)
+TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
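+
+// A minimal sketch (hypothetical test, not part of the upstream suite) of
+// the property the TESTEND cases rely on: a channel-order swap applied
+// twice restores the original pixels.
+TEST_F(LibYUVConvertTest, ARGBToBGRATwiceIsIdentity) {
+ uint8_t argb[4] = {1, 2, 3, 4};
+ uint8_t bgra[4] = {0, 0, 0, 0};
+ uint8_t back[4] = {0, 0, 0, 0};
+ ARGBToBGRA(argb, 4, bgra, 4, 1, 1);  // ARGB -> BGRA
+ BGRAToARGB(bgra, 4, back, 4, 1, 1);  // BGRA -> ARGB
+ for (int i = 0; i < 4; ++i) {
+ EXPECT_EQ(argb[i], back[i]);
+ }
+}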
+
+#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1)
+#else
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#endif
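+
+// The trailing ATTEN argument is the conversions' attenuate flag; the
+// _Premult variant passes 1 so RGB is premultiplied by alpha.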
+
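+// The single-letter prefixes below bind the same I4xxAlphaTo* entry points
+// to a colorspace matrix: J = JPEG (full-range BT.601), F = full-range
+// BT.709, H = BT.709, U = BT.2020, V = full-range BT.2020.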
+#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+
+#define I420AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+#define I422AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+
+#if defined(ENABLE_FULL_TESTS)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1)
+#else
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1)
+#endif
+
+TEST_F(LibYUVConvertTest, TestYToARGB) {
+ uint8_t y[32];
+ uint8_t expectedg[32];
+ for (int i = 0; i < 32; ++i) {
+ y[i] = i * 5 + 17;
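+ // 1.164 ~= 255.0 / 219: expand limited-range Y (16..235) to full range.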
+ expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
+ }
+ uint8_t argb[32 * 4];
+ YToARGB(y, 0, argb, 0, 32, 1);
+
+ for (int i = 0; i < 32; ++i) {
+ printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
+ argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
+ }
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
+ }
+}
+
+static const uint8_t kNoDither4x4[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
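+// With an all-zero dither table, ARGBToRGB565Dither must produce
+// byte-identical output to plain ARGBToRGB565; TestNoDither verifies this.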
+TEST_F(LibYUVConvertTest, TestNoDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
+ benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+ }
+
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
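+// Each table value (0..7) is added to the 8-bit channels (clamped to 255)
+// before truncation to 5/6/5 bits, so after the RGB565 round trip the
+// dithered and undithered results may differ by several counts per channel,
+// hence the EXPECT_NEAR tolerance in TestDither.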
+TEST_F(LibYUVConvertTest, TestDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_argbdither,
+ benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
+ benchmark_height_);
+ RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+ }
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(dst_argbdither);
+}
+
+#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B##Dither( \
+ src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
+ dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
+#endif
+
+// Transitive test: A to B to C gives the same result as A to C.
+// Benchmarks A to B to C for comparison to the 1 step conversion,
+// benchmarked elsewhere.
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_b + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideC, kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+#endif
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+#else
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+#endif
+
+// Transitive test: compare 1 step vs 2 step conversion for YUVA to ARGB.
+// Benchmarks the 2 step conversion for comparison to the 1 step conversion.
+#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ /* Convert A to B */ \
+ FMT_PLANAR##To##FMT_B( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ /* Convert A to C */ \
+ FMT_PLANAR##To##FMT_C( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
+#else
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(F422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(F422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+#else
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+#endif
+
+#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
+ OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \
+ kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ for (int i = 0; i < kStrideC * kHeight; i += 4) { \
+ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \
+ EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \
+ } \
+ free_aligned_buffer_page_end(src_argb_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Unaligned, +, 4, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+// Caveat: Destination needs to be 4 bytes per pixel. AR30/AB30 keep only
+// 2 alpha bits, so TESTPLANETOEI compares the alpha channel with
+// EXPECT_NEAR rather than EXPECT_EQ.
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
+TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
+#endif
+
+TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
+ // 2x2 frames
+ uint32_t src[4];
+ uint32_t dst[4];
+ // some random input
+ src[0] = 0x11000000;
+ src[1] = 0x00450000;
+ src[2] = 0x00009f00;
+ src[3] = 0x000000ff;
+ // zeros on destination
+ dst[0] = 0x00000000;
+ dst[1] = 0x00000000;
+ dst[2] = 0x00000000;
+ dst[3] = 0x00000000;
+
+ int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src),
+ 16, // input size
+ reinterpret_cast<uint8_t*>(dst),
+ 8, // destination stride
+ 0, // crop_x
+ 0, // crop_y
+ 2, // width
+ 2, // height
+ 2, // crop width
+ 2, // crop height
+ kRotate90, FOURCC_ARGB);
+
+ EXPECT_EQ(r, 0);
+ // 90 degree clockwise rotation, no color conversion
+ EXPECT_EQ(dst[0], src[2]);
+ EXPECT_EQ(dst[1], src[0]);
+ EXPECT_EQ(dst[2], src[3]);
+ EXPECT_EQ(dst[3], src[1]);
+}
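+
+// For reference: rotating a 2x2 frame 90 degrees clockwise maps the
+// row-major indices as
+//   src: 0 1     dst: 2 0
+//        2 3          3 1
+// which is exactly what the EXPECTs above verify.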
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
+ // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ARGBToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ARGBToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ARGBTOAR30ROW_AVX2
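+
+// A minimal sketch (hypothetical test, not part of the upstream suite) of
+// the AR30 layout the row functions above produce: per little-endian 32-bit
+// word, 10-bit B in bits 0..9, G in 10..19, R in 20..29 and 2-bit A in
+// 30..31; an 8-bit channel v widens to 10 bits as (v << 2) | (v >> 6).
+TEST_F(LibYUVConvertTest, ARGBToAR30RowPackingSketch) {
+ const uint8_t src[4] = {0x12, 0x34, 0x56, 0xff};  // B, G, R, A in memory
+ uint8_t dst[4];
+ ARGBToAR30Row_C(src, dst, 1);
+ uint32_t v;
+ memcpy(&v, dst, 4);
+ EXPECT_EQ(v & 0x3ffu, (0x12u << 2) | (0x12u >> 6));          // B
+ EXPECT_EQ((v >> 10) & 0x3ffu, (0x34u << 2) | (0x34u >> 6));  // G
+ EXPECT_EQ((v >> 20) & 0x3ffu, (0x56u << 2) | (0x56u >> 6));  // R
+ EXPECT_EQ(v >> 30, 0xffu >> 6);                              // A
+}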
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
+ // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ABGRToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ABGRToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ABGRTOAR30ROW_AVX2
+
+#if !defined(LEAN_TESTS)
+
+// Provide matrix wrappers for 12 bit 4:2:0 and 10 bit 4:4:4 YUV
+#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I012ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I012ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+
+#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+
+#define I010ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I010ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I010ToAR30Filter(a, b, c, d, e, f, g, h, i, j) \
+ I010ToAR30MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I210ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I210ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I210ToAR30Filter(a, b, c, d, e, f, g, h, i, j) \
+ I210ToAR30MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+
+// TODO(fbarchard): Fix clamping issue affecting the U channel.
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \
+ reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 4, 4) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#else
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#endif
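+
+// FMT_MASK limits the random 16-bit samples to the source bit depth:
+// 0x3ff for the 10 bit formats (I010/I210/I410), 0xfff for 12 bit (I012).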
+
+// These conversions are only optimized for x86
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGBFilter, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGBFilter, 4, 4, 1)
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30Filter, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
+#endif // LITTLE_ENDIAN_ONLY_TEST
+#endif // DISABLE_SLOW_TESTS
+
+#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ reinterpret_cast<uint16_t*>(src_a + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ reinterpret_cast<uint16_t*>(src_v + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH)
+#else
+#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#endif
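+
+// S_DEPTH is the source bit depth; Y/U/V/A samples are masked with
+// (1 << S_DEPTH) - 1 so they stay within the format's valid range.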
+
+#define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I010AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+#define I210AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+
+// These conversions are only optimized for x86
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGBFilter, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
+#endif // DISABLE_SLOW_TESTS
+
+#define TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
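+ /* P0xx/P2xx samples are MSB-aligned within 16 bits, so the fill masks */ \
+ /* below keep the top S_DEPTH bits (the I0xx tests mask the low bits). */ \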
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \
+ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF)[i] = \
+ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF), \
+ kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF), \
+ kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#else
+#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#endif
+
+#define P010ToARGB(a, b, c, d, e, f, g, h) \
+ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P210ToARGB(a, b, c, d, e, f, g, h) \
+ P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P010ToAR30(a, b, c, d, e, f, g, h) \
+ P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P210ToAR30(a, b, c, d, e, f, g, h) \
+ P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P012ToARGB(a, b, c, d, e, f, g, h) \
+ P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P212ToARGB(a, b, c, d, e, f, g, h) \
+ P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P012ToAR30(a, b, c, d, e, f, g, h) \
+ P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P212ToAR30(a, b, c, d, e, f, g, h) \
+ P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P016ToARGB(a, b, c, d, e, f, g, h) \
+ P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P216ToARGB(a, b, c, d, e, f, g, h) \
+ P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P016ToAR30(a, b, c, d, e, f, g, h) \
+ P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P216ToAR30(a, b, c, d, e, f, g, h) \
+ P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \
+ P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \
+ P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \
+ P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \
+ P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTBP16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10)
+TESTBP16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12)
+TESTBP16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12)
+TESTBP16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16)
+TESTBP16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16)
+TESTBP16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBP16TOB(P010, 2, 2, AR30, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, AR30, 4, 4, 1, 10)
+TESTBP16TOB(P012, 2, 2, AR30, 4, 4, 1, 12)
+TESTBP16TOB(P212, 2, 1, AR30, 4, 4, 1, 12)
+TESTBP16TOB(P016, 2, 2, AR30, 4, 4, 1, 16)
+TESTBP16TOB(P216, 2, 1, AR30, 4, 4, 1, 16)
+TESTBP16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10)
+#endif // LITTLE_ENDIAN_ONLY_TEST
+#endif // DISABLE_SLOW_TESTS
+
+static int Clamp(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 255) {
+ y = 255;
+ }
+ return y;
+}
+
+static int Clamp10(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 1023) {
+ y = 1023;
+ }
+ return y;
+}
+
+// Test 8 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToARGB) {
+ const int kSize = 256;
+ int histogram_b[256];
+ int histogram_g[256];
+ int histogram_r[256];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 128;  // 128 is the neutral (zero) chroma value.
+ orig_v[i] = 128;
+ }
+
+ H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
+ // Reference formula for Y channel contribution in YUV to RGB conversions:
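+ // 1.164f approximates 255.0 / 219.0, expanding limited-range Y (16..235)
+ // to full range (0..255); the 0.5f rounds to nearest.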
+ int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f + 0.5f));
+ EXPECT_EQ(b, expected_y);
+ EXPECT_EQ(g, expected_y);
+ EXPECT_EQ(r, expected_y);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH010ToARGB) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
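+ // 64 is the 10-bit limited-range offset (16 << 2); the division by 4
+ // scales the 10-bit value down to the 8-bit output range.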
+ int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
+ EXPECT_NEAR(b, expected_y, 1);
+ EXPECT_NEAR(g, expected_y, 1);
+ EXPECT_NEAR(r, expected_y, 1);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: results are compared with EXPECT_NEAR because the expected
+// value is computed with float rounding.
+TEST_F(LibYUVConvertTest, TestH010ToAR30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f + 0.5));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: results are compared with EXPECT_NEAR because the expected
+// value is computed with float rounding.
+TEST_F(LibYUVConvertTest, TestH010ToAB30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ab30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int r10 = reinterpret_cast<uint32_t*>(ab30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 10) & 1023;
+ int b10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ab30_pixels);
+}
+
+// Test 8 bit YUV to 10 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToAR30) {
+ const int kSize = 256;
+ const int kHistSize = 1024;
+ int histogram_b[kHistSize];
+ int histogram_g[kHistSize];
+ int histogram_r[kHistSize];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 128; // 128 is 0.
+ orig_v[i] = 128;
+ }
+
+ H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kHistSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test that I400 with the JPEG matrix produces the same output as J400
+TEST_F(LibYUVConvertTest, TestI400) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_i400, kSize);
+ align_buffer_page_end(argb_pixels_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_j400, kSize * 4);
+ align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_i400[i] = i;
+ }
+
+ J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
+ I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
+ kSize, 1);
+
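+ // I400 is limited range (Y = 16 maps to 0) while J400 and the JPEG matrix
+ // are full range (Y passes through unchanged).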
+ EXPECT_EQ(0, argb_pixels_i400[0]);
+ EXPECT_EQ(0, argb_pixels_j400[0]);
+ EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+ EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+ EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+ EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+
+ for (int i = 0; i < kSize * 4; ++i) {
+ if ((i & 3) == 3) {
+ EXPECT_EQ(255, argb_pixels_j400[i]);
+ } else {
+ EXPECT_EQ(i / 4, argb_pixels_j400[i]);
+ }
+ EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_i400);
+ free_aligned_buffer_page_end(argb_pixels_i400);
+ free_aligned_buffer_page_end(argb_pixels_j400);
+ free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
+ free_aligned_buffer_page_end(argb_pixels_h709_i400);
+ free_aligned_buffer_page_end(argb_pixels_2020_i400);
+}
+
+// Test RGB24 to ARGB and back to RGB24
+TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ align_buffer_page_end(dest_rgb24, kSize * 3);
+
+ // Test grey scale
+ for (int i = 0; i < kSize * 3; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
+ ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
+
+ for (int i = 0; i < kSize * 3; ++i) {
+ EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(argb_pixels);
+ free_aligned_buffer_page_end(dest_rgb24);
+}
+
+TEST_F(LibYUVConvertTest, Test565) {
+ SIMD_ALIGNED(uint8_t orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8_t pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+#endif // !defined(LEAN_TESTS)
+
+} // namespace libyuv
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
new file mode 100644
index 00000000..f55bace3
--- /dev/null
+++ b/unit_test/convert_test.cc
@@ -0,0 +1,2110 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "../unit_test/unit_test.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+// Some functions fail on big endian. Enable these tests on all CPUs except
+// PowerPC; they exercise unoptimized code, so they are disabled by default.
+#if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__)
+#define LITTLE_ENDIAN_ONLY_TEST 1
+#endif
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that are unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+// Alias to copy pixels as is
+#define AR30ToAR30 ARGBCopy
+#define ABGRToABGR ARGBCopy
+
+// Subsample a dimension with a rounding-up divide.
+#define SUBSAMPLE(v, a) (((v) + (a) - 1) / (a))
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
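+// Both round up: e.g. SUBSAMPLE(5, 2) == 3 and ALIGNINT(5, 4) == 8.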
+
+// Planar test
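+// Each test runs the conversion once with SIMD disabled (the C reference),
+// then benchmark_iterations_ times with the benchmarked CPU flags, and
+// expects the two outputs to match byte for byte.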
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+ SRC_DEPTH) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_u, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_v, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
+ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+ SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#else
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#endif
+
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
+
+// Test Android 420 to I420
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8_t* src_u = src_uv + OFF_U; \
+ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \
+ _Any, +, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 2, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+#else
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+#endif
+
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+#undef TESTAPLANARTOP
+#undef TESTAPLANARTOPI
+
+// Wrapper so the macro-generated tests can call I400ToNV21 with the same
+// signature as the other planar conversions; the U and V inputs are unused.
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* /* src_u */,
+ int /* src_stride_u */,
+ const uint8_t* /* src_v */,
+ int /* src_stride_v */,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, width, height);
+}
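+// Used by the TESTPLANARTOBP(I400, ..., NV21, ...) instantiation below.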
+
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+ SRC_DEPTH) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_u, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_v, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_c, \
+ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_opt, \
+ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
+ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+ SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \
+ src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_c), \
+ kDstHalfWidth * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \
+ EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
+ SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#else
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#endif
+
+TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10)
+TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
+
+#define TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, W1280, N, NEG, OFF, DOY, SRC_DEPTH, \
+ TILE_WIDTH, TILE_HEIGHT) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
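+ /* Tiled sources (e.g. MM21 uses 16x32 tiles) are padded up to whole */ \
+ /* tiles; TILE_WIDTH and TILE_HEIGHT of 1 leave dimensions unchanged. */ \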
+ const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+ const int kPaddedHeight = \
+ (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+ align_buffer_page_end( \
+ src_uv, \
+ 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_c, \
+ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_opt, \
+ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+ for (int i = 0; \
+ i < kPaddedWidth * kPaddedHeight * SRC_BPC / (int)sizeof(SRC_T); \
+ ++i) { \
+ src_y_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2 * \
+ SRC_BPC / (int)sizeof(SRC_T); \
+ ++i) { \
+ src_uv_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \
+ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \
+ DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \
+ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \
+ DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ } \
+ for (int i = 0; i < kDstHalfHeight; ++i) { \
+ for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \
+ dst_uv_opt[i * 2 * kDstHalfWidth + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT)
+#else
+#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT)
+#endif
+
+TESTBPTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBPTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBPTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBPTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBPTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBPTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBPTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16, 1, 1)
+TESTBPTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16, 1, 1)
+TESTBPTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
+TESTBPTOBP(MT2T, uint8_t, 10 / 8, 2, 2, P010, uint16_t, 2, 2, 2, 10, 16, 32)
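+// MT2T above packs 10-bit samples; passing SRC_BPC as the textual "10 / 8"
+// makes size expressions such as kPaddedWidth * kPaddedHeight * SRC_BPC
+// expand to (w * h * 10) / 8, the byte count of packed 10-bit data.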
+
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
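+ /* U and V share one destination buffer as alternating rows: U rows at */ \
+ /* stride 2 * kStrideUV, with V rows offset into it by kStrideUV. */ \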
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1)
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1)
+TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
+
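+// TESTATOPLANARAI mirrors TESTATOPLANARI for conversions that also emit
+// an alpha plane (here ARGBToI420Alpha): dst_a_c and dst_a_opt get
+// distinct sentinel fills and are compared alongside the Y plane.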
+#define TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_a_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_a_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_a_c, 1, kWidth* kHeight); \
+ memset(dst_y_c, 2, kWidth* kHeight); \
+ memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_a_opt, 101, kWidth* kHeight); \
+ memset(dst_y_opt, 102, kWidth* kHeight); \
+ memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
+ dst_a_c, kWidth, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, dst_a_opt, kWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_a_c); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_a_opt); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
+
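+// TESTATOBPI covers packed-to-biplanar (NV12/NV21-style) destinations:
+// U and V are interleaved into a single dst_uv buffer, so the chroma
+// stride and comparison loops use kStrideUV * 2 bytes per row.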
+#define TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \
+ dst_uv_opt[i * kStrideUV * 2 + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOBP(ARGB, 1, 4, NV12, 2, 2)
+TESTATOBP(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBP(ABGR, 1, 4, NV12, 2, 2)
+TESTATOBP(ABGR, 1, 4, NV21, 2, 2)
+TESTATOBP(RAW, 1, 3, JNV21, 2, 2)
+TESTATOBP(YUY2, 2, 4, NV12, 2, 2)
+TESTATOBP(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBP(AYUV, 1, 4, NV12, 2, 2)
+TESTATOBP(AYUV, 1, 4, NV21, 2, 2)
+
+#if !defined(LEAN_TESTS)
+
+#ifdef HAVE_JPEG
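+// ValidateJpeg is expected to require an SOI marker (0xff 0xd8) at the
+// start of the buffer and an EOI marker (0xff 0xd9) near the end; the
+// tests below construct minimal buffers around those two markers.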
+TEST_F(LibYUVConvertTest, ValidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // Test special value that matches marker start.
+ memset(orig_pixels, 0xff, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ const int kMultiple = 10;
+ const int kBufSize = kImageSize * kMultiple + kOff;
+ align_buffer_page_end(orig_pixels, kBufSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kBufSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, InvalidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // NULL pointer. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(NULL, kSize));
+
+ // Negative size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
+
+ // Too large size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // SOI but no EOI. Expect fail.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+ }
+
+ // EOI but no SOI. Expect fail.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 0;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, FuzzJpeg) {
+ // SOI but no EOI. Expect fail.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ const int kSize = fastrand() % 5000 + 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ MemRandomize(orig_pixels, kSize);
+
+ // Add SOI so frame will be scanned.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - 1] = 0xff;
+ ValidateJpeg(orig_pixels,
+ kSize); // Failure normally expected.
+ free_aligned_buffer_page_end(orig_pixels);
+ }
+}
+
+// Test data created in GIMP. In the JPEG export dialog, disable
+// thumbnails etc., choose a subsampling, and use low quality (50) to
+// keep the size small. Generated with xxd -i test.jpg.
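+// For example (hypothetical filename), a replacement fixture could be
+// regenerated with:
+//   xxd -i test.jpg > test_jpg.inc
+// and the emitted array and length constant pasted in below.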
+// test 0 is J400
+static const uint8_t kTest0Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
+ 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest0JpgLen = 421;
+
+// test 1 is J444
+static const uint8_t kTest1Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
+ 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
+ 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
+ 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
+ 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
+ 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
+ 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
+ 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
+ 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
+ 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
+ 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
+ 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
+ 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
+ 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
+ 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
+ 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
+ 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
+ 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
+ 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
+ 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
+ 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
+ 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
+ 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
+ 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
+ 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
+ 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
+ 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
+ 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
+ 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
+ 0xd4, 0xff, 0xd9};
+static const size_t kTest1JpgLen = 735;
+
+// test 2 is J420
+static const uint8_t kTest2Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
+ 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
+ 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
+ 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
+ 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
+ 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
+ 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
+ 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
+ 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
+ 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
+ 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest2JpgLen = 685;
+
+// test 3 is J422
+static const uint8_t kTest3Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
+ 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
+ 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
+ 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
+ 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
+ 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
+ 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
+ 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
+ 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
+ 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
+ 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
+ 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
+ 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
+ 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
+ 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
+ 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
+ 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
+ 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest3JpgLen = 704;
+
+// test 4 is J422 vertical - not supported
+static const uint8_t kTest4Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
+ 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
+ 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
+ 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
+ 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
+ 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
+ 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
+ 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
+ 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
+ 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
+ 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
+ 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
+ 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
+ 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
+ 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
+ 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
+ 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest4JpgLen = 701;
+
+TEST_F(LibYUVConvertTest, TestMJPGSize) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ printf("test jpeg size %d x %d\n", width, height);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_u, half_width * half_height);
+ align_buffer_page_end(dst_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
+ dst_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
+ uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_u_hash, 2501859930u);
+ EXPECT_EQ(dst_v_hash, 2126459123u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ // Convert to NV21
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV21
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_vu, half_width * half_height * 2);
+
+ I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_vu, half_width * 2, width, height);
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_vu);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ // Convert to NV12
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV12
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_uv, half_width * half_height * 2);
+
+ I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_uv, half_width * 2, width, height);
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_uv[i], dst3_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_uv);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+// TODO(fbarchard): Improve test to compare against I422, not checksum
+TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 493520167u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 493520167u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_uv_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_vu_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_argb, width * height * 4);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
+ height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+#ifdef LIBYUV_UNLIMITED_DATA
+ EXPECT_EQ(dst_argb_hash, 3900633302u);
+#else
+ EXPECT_EQ(dst_argb_hash, 2355976473u);
+#endif
+
+ free_aligned_buffer_page_end(dst_argb);
+}
+
+static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+
+ int width = mjpeg_decoder.GetWidth();
+ int height = mjpeg_decoder.GetHeight();
+
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J420, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J422, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J444, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ printf("JPeg is J400, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ } else {
+ // Unknown colorspace.
+ printf("JPeg is Unknown colorspace.\n");
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret;
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGInfo) {
+ EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
+ kTest4JpgLen)); // Valid but unsupported.
+}
+#endif // HAVE_JPEG
+
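+// NV12Crop verifies that ConvertToI420 with a crop_y offset matches a
+// manual NV12ToI420 call on pointers advanced into the cropped region;
+// crop_y is rounded down to an even value so the half-height UV plane
+// stays aligned with the crop.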
+TEST_F(LibYUVConvertTest, NV12Crop) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y =
+ ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size =
+ kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_uv = src_y + kWidth * kHeight;
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
+ src_uv[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_y_2, 1, kDestWidth * kDestHeight);
+ memset(dst_u_2, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v_2, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
+
+ NV12ToI420(src_y + crop_y * kWidth, kWidth,
+ src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
+ kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(dst_y_2);
+ free_aligned_buffer_page_end(dst_u_2);
+ free_aligned_buffer_page_end(dst_v_2);
+ free_aligned_buffer_page_end(src_y);
+}
+
+TEST_F(LibYUVConvertTest, I420CropOddY) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y = benchmark_height_ > 1 ? 1 : 0;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size = kWidth * kHeight +
+ kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
+ kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_u = src_y + kWidth * kHeight;
+ uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
+ src_u[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
+ src_v[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0,
+ libyuv::FOURCC_I420);
+ }
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+ dst_y[i * kDestWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+ dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+ dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(src_y);
+}
+
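+// TESTPTOB checks a direct packed-to-NV12 conversion against a two-step
+// reference (packed -> I420 -> NV12). For the YUY2 instantiation the
+// sequence is roughly:
+//
+//   YUY2ToI420(...);  // step 1: reference planar output.
+//   I420ToNV12(...);  // step 2: reference biplanar output.
+//   YUY2ToNV12(...);  // direct path under test, compared byte for byte.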
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
+ TEST_F(LibYUVConvertTest, NAME) { \
+ const int kWidth = benchmark_width_; \
+ const int kHeight = benchmark_height_; \
+ \
+ align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ align_buffer_page_end(orig_y, kWidth* kHeight); \
+ align_buffer_page_end(orig_u, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ align_buffer_page_end(orig_v, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ \
+ /* Convert the packed source to NV12 in 2 steps for reference */ \
+ libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
+ orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ \
+ /* Convert to NV12 */ \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
+ dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ } \
+ \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(orig_y[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
+ ++i) { \
+ EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
+ } \
+ \
+ free_aligned_buffer_page_end(orig_uyvy); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(dst_y_orig); \
+ free_aligned_buffer_page_end(dst_uv_orig); \
+ free_aligned_buffer_page_end(dst_y); \
+ free_aligned_buffer_page_end(dst_uv); \
+ }
+
+TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
+TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+
+TEST_F(LibYUVConvertTest, MM21ToYUY2) {
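+ // MM21 is a tiled format; round width up to 16 and height up to 32 so
+ // whole tiles are covered.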
+ const int kWidth = (benchmark_width_ + 15) & (~15);
+ const int kHeight = (benchmark_height_ + 31) & (~31);
+
+ align_buffer_page_end(orig_y, kWidth * kHeight);
+ align_buffer_page_end(orig_uv,
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ align_buffer_page_end(tmp_y, kWidth * kHeight);
+ align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+ align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight);
+ align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight);
+
+ MemRandomize(orig_y, kWidth * kHeight);
+ MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ /* Convert MM21 to YUY2 in 2 steps for reference */
+ libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y,
+ kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v,
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+ libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v,
+ SUBSAMPLE(kWidth, 2), golden_yuyv,
+ 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+
+ /* Convert to YUY2 */
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2),
+ dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+ }
+
+ for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) {
+ EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(orig_uv);
+ free_aligned_buffer_page_end(tmp_y);
+ free_aligned_buffer_page_end(tmp_u);
+ free_aligned_buffer_page_end(tmp_v);
+ free_aligned_buffer_page_end(dst_yuyv);
+ free_aligned_buffer_page_end(golden_yuyv);
+}
+
+// Test RGB24 to J420 is exact
+#if defined(LIBYUV_BIT_EXACT)
+TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
+ align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2);
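+ // Scale the iteration count so each run covers roughly
+ // benchmark_width_ x benchmark_height_ pixels in 256x2 blocks.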
+ int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) /
+ (kSize * 2) * benchmark_iterations_;
+
+ for (int i = 0; i < kSize * 3 * 2; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ for (int i = 0; i < iterations256; ++i) {
+ RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane
+ dest_j420 + kSize * 2, kSize / 2, // U plane
+ dest_j420 + kSize * 5 / 2, kSize / 2, // V plane
+ kSize, 2);
+ }
+
+ uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381);
+ EXPECT_EQ(2755440272u, checksum);
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(dest_j420);
+}
+#endif
+
+// Test RGB24 to I420 is exact
+#if defined(LIBYUV_BIT_EXACT)
+TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
+ align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2);
+ int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) /
+ (kSize * 2) * benchmark_iterations_;
+
+ for (int i = 0; i < kSize * 3 * 2; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ for (int i = 0; i < iterations256; ++i) {
+ RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane
+ dest_i420 + kSize * 2, kSize / 2, // U plane
+ dest_i420 + kSize * 5 / 2, kSize / 2, // V plane
+ kSize, 2);
+ }
+
+ uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381);
+ EXPECT_EQ(1526656597u, checksum);
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(dest_i420);
+}
+#endif
+
+#endif // !defined(LEAN_TESTS)
+
+} // namespace libyuv
diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc
new file mode 100644
index 00000000..437b6632
--- /dev/null
+++ b/unit_test/cpu_test.cc
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestCpuHas) {
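+ // A mask of -1 returns all detected CPU flags.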
+ int cpu_flags = TestCpuFlag(-1);
+ printf("Cpu Flags 0x%x\n", cpu_flags);
+#if defined(__arm__) || defined(__aarch64__)
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM 0x%x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON 0x%x\n", has_neon);
+#endif
+#if defined(__riscv) && defined(__linux__)
+ int has_riscv = TestCpuFlag(kCpuHasRISCV);
+ printf("Has RISCV 0x%x\n", has_riscv);
+ int has_rvv = TestCpuFlag(kCpuHasRVV);
+ printf("Has RVV 0x%x\n", has_rvv);
+ int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH);
+ printf("Has RVVZVFH 0x%x\n", has_rvvzvfh);
+#endif
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_erms = TestCpuFlag(kCpuHasERMS);
+ int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+ int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
+ int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+ int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+ int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+ int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+ int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+ int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+ printf("Has X86 0x%x\n", has_x86);
+ printf("Has SSE2 0x%x\n", has_sse2);
+ printf("Has SSSE3 0x%x\n", has_ssse3);
+ printf("Has SSE41 0x%x\n", has_sse41);
+ printf("Has SSE42 0x%x\n", has_sse42);
+ printf("Has AVX 0x%x\n", has_avx);
+ printf("Has AVX2 0x%x\n", has_avx2);
+ printf("Has ERMS 0x%x\n", has_erms);
+ printf("Has FMA3 0x%x\n", has_fma3);
+ printf("Has F16C 0x%x\n", has_f16c);
+ printf("Has AVX512BW 0x%x\n", has_avx512bw);
+ printf("Has AVX512VL 0x%x\n", has_avx512vl);
+ printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+ printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+ printf("Has AVX10 0x%x\n", has_avx10);
+ printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+ printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
+#endif
+#if defined(__mips__)
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ printf("Has MIPS 0x%x\n", has_mips);
+ int has_msa = TestCpuFlag(kCpuHasMSA);
+ printf("Has MSA 0x%x\n", has_msa);
+#endif
+#if defined(__loongarch__)
+ int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
+ printf("Has LOONGARCH 0x%x\n", has_loongarch);
+ int has_lsx = TestCpuFlag(kCpuHasLSX);
+ printf("Has LSX 0x%x\n", has_lsx);
+ int has_lasx = TestCpuFlag(kCpuHasLASX);
+ printf("Has LASX 0x%x\n", has_lasx);
+#endif
+}
+
+TEST_F(LibYUVBaseTest, TestCompilerMacros) {
+ // Tests all macros used in public headers.
+#ifdef __ATOMIC_RELAXED
+ printf("__ATOMIC_RELAXED %d\n", __ATOMIC_RELAXED);
+#endif
+#ifdef __cplusplus
+ printf("__cplusplus %ld\n", __cplusplus);
+#endif
+#ifdef __clang_major__
+ printf("__clang_major__ %d\n", __clang_major__);
+#endif
+#ifdef __clang_minor__
+ printf("__clang_minor__ %d\n", __clang_minor__);
+#endif
+#ifdef __GNUC__
+ printf("__GNUC__ %d\n", __GNUC__);
+#endif
+#ifdef __GNUC_MINOR__
+ printf("__GNUC_MINOR__ %d\n", __GNUC_MINOR__);
+#endif
+#ifdef __i386__
+ printf("__i386__ %d\n", __i386__);
+#endif
+#ifdef __x86_64__
+ printf("__x86_64__ %d\n", __x86_64__);
+#endif
+#ifdef _M_IX86
+ printf("_M_IX86 %d\n", _M_IX86);
+#endif
+#ifdef _M_X64
+ printf("_M_X64 %d\n", _M_X64);
+#endif
+#ifdef _MSC_VER
+ printf("_MSC_VER %d\n", _MSC_VER);
+#endif
+#ifdef __aarch64__
+ printf("__aarch64__ %d\n", __aarch64__);
+#endif
+#ifdef __arm__
+ printf("__arm__ %d\n", __arm__);
+#endif
+#ifdef __riscv
+ printf("__riscv %d\n", __riscv);
+#endif
+#ifdef __riscv_vector
+ printf("__riscv_vector %d\n", __riscv_vector);
+#endif
+#ifdef __riscv_v_intrinsic
+ printf("__riscv_v_intrinsic %d\n", __riscv_v_intrinsic);
+#endif
+#ifdef __APPLE__
+ printf("__APPLE__ %d\n", __APPLE__);
+#endif
+#ifdef __clang__
+ printf("__clang__ %d\n", __clang__);
+#endif
+#ifdef __CLR_VER
+ printf("__CLR_VER %d\n", __CLR_VER);
+#endif
+#ifdef __CYGWIN__
+ printf("__CYGWIN__ %d\n", __CYGWIN__);
+#endif
+#ifdef __llvm__
+ printf("__llvm__ %d\n", __llvm__);
+#endif
+#ifdef __mips_msa
+ printf("__mips_msa %d\n", __mips_msa);
+#endif
+#ifdef __mips
+ printf("__mips %d\n", __mips);
+#endif
+#ifdef __mips_isa_rev
+ printf("__mips_isa_rev %d\n", __mips_isa_rev);
+#endif
+#ifdef _MIPS_ARCH_LOONGSON3A
+ printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A);
+#endif
+#ifdef __loongarch__
+ printf("__loongarch__ %d\n", __loongarch__);
+#endif
+#ifdef _WIN32
+ printf("_WIN32 %d\n", _WIN32);
+#endif
+#ifdef __native_client__
+ printf("__native_client__ %d\n", __native_client__);
+#endif
+#ifdef __pic__
+ printf("__pic__ %d\n", __pic__);
+#endif
+#ifdef __pnacl__
+ printf("__pnacl__ %d\n", __pnacl__);
+#endif
+#ifdef GG_LONGLONG
+ printf("GG_LONGLONG %lld\n", GG_LONGLONG(1));
+#endif
+#ifdef INT_TYPES_DEFINED
+ printf("INT_TYPES_DEFINED\n");
+#endif
+#ifdef __has_feature
+ printf("__has_feature\n");
+#if __has_feature(memory_sanitizer)
+ printf("__has_feature(memory_sanitizer) %d\n",
+ __has_feature(memory_sanitizer));
+#endif
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+TEST_F(LibYUVBaseTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
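+ // CPUID leaf 0 returns the vendor string in EBX, EDX, ECX; cpu_info
+ // holds EAX, EBX, ECX, EDX, so swap the words into string order and
+ // NUL terminate before printing.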
+ CpuId(0, 0, cpu_info);
+ cpu_info[0] = cpu_info[1]; // Reorder output
+ cpu_info[1] = cpu_info[3];
+ cpu_info[3] = 0;
+ printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
+ reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
+ cpu_info[2]);
+ EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
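+ // Merge the base and extended bits into a single family and model.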
+ CpuId(1, 0, cpu_info);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+ model);
+ }
+}
+#endif
+
+static int FileExists(const char* file_name) {
+ FILE* f = fopen(file_name, "r");
+ if (!f) {
+ return 0;
+ }
+ fclose(f);
+ return 1;
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxNeon) {
+ if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+
+ EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ if (FileExists("/proc/cpuinfo")) {
+ if (kCpuHasNEON != ArmCpuCaps("/proc/cpuinfo")) {
+ // This can happen on ARM emulator but /proc/cpuinfo is from host.
+ printf("WARNING: Neon build enabled but CPU does not have NEON\n");
+ }
+ } else {
+ printf("WARNING: unable to load \"/proc/cpuinfo\"\n");
+ }
+#endif
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxMipsMsa) {
+ if (FileExists("../../unit_test/testdata/mips.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n");
+
+ EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt"));
+ EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt"));
+ EXPECT_EQ(kCpuHasMSA,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n");
+ }
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxRVV) {
+ if (FileExists("../../unit_test/testdata/riscv64.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n");
+
+ EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
+ EXPECT_EQ(kCpuHasRVV,
+ RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt"));
+ EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
+ RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt"));
+ } else {
+ printf(
+ "WARNING: unable to load "
+ "\"../../unit_test/testdata/riscv64.txt\"\n");
+ }
+#if defined(__linux__) && defined(__riscv)
+ if (FileExists("/proc/cpuinfo")) {
+ if (!(kCpuHasRVV & RiscvCpuCaps("/proc/cpuinfo"))) {
+ // This can happen on RVV emulator but /proc/cpuinfo is from host.
+ printf("WARNING: RVV build enabled but CPU does not have RVV\n");
+ }
+ } else {
+ printf("WARNING: unable to load \"/proc/cpuinfo\"\n");
+ }
+#endif
+}
+
+// TODO(fbarchard): Fix clangcl test of cpuflags.
+#ifdef _MSC_VER
+TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) {
+#else
+TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
+#endif
+ // Reset any masked flags that may have been set so auto init is enabled.
+ MaskCpuFlags(0);
+
+ int original_cpu_flags = TestCpuFlag(-1);
+
+ // Test setting different CPU configurations.
+ int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ cpu_flags = kCpuHasX86 | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ // Test that setting 0 turns auto-init back on.
+ SetCpuFlags(0);
+ EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
+
+ // Restore the CPU flag mask.
+ MaskCpuFlags(benchmark_cpu_info_);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/cpu_thread_test.cc b/unit_test/cpu_thread_test.cc
index 59061b98..69aab74e 100644
--- a/files/unit_test/cpu_thread_test.cc
+++ b/unit_test/cpu_thread_test.cc
@@ -12,7 +12,7 @@
#include "libyuv/cpu_id.h"
-#if defined(__clang__)
+#if defined(__clang__) && !defined(__wasm__)
#if __has_include(<pthread.h>)
#define LIBYUV_HAVE_PTHREAD 1
#endif
@@ -30,7 +30,7 @@ namespace libyuv {
void* ThreadMain(void* arg) {
int* flags = static_cast<int*>(arg);
- *flags = TestCpuFlag(kCpuHasSSSE3);
+ *flags = TestCpuFlag(kCpuInitialized);
return nullptr;
}
#endif // LIBYUV_HAVE_PTHREAD
diff --git a/files/unit_test/math_test.cc b/unit_test/math_test.cc
index 0abbad51..a1544c12 100644
--- a/files/unit_test/math_test.cc
+++ b/unit_test/math_test.cc
@@ -16,10 +16,14 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h"
+#endif
namespace libyuv {
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVBaseTest, TestFixedDiv) {
int num[1280];
int div[1280];
@@ -151,5 +155,6 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
EXPECT_NEAR(result_c[j], result_opt[j], 1);
}
}
+#endif // ENABLE_ROW_TESTS
} // namespace libyuv
diff --git a/files/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 70f8966e..ec1d72eb 100644
--- a/files/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -12,9 +12,6 @@
#include <stdlib.h>
#include <time.h>
-// row.h defines SIMD_ALIGNED, overriding unit_test.h
-#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
-
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -24,6 +21,19 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+// TODO(fbarchard): Remove row.h from unittests. Test public functions.
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+#endif
+
+#if defined(LIBYUV_BIT_EXACT)
+#define EXPECTED_UNATTENUATE_DIFF 0
+#else
+#define EXPECTED_UNATTENUATE_DIFF 2
+#endif
namespace libyuv {
@@ -47,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
orig_pixels[2 * 4 + 0] = 16u;
orig_pixels[2 * 4 + 1] = 64u;
orig_pixels[2 * 4 + 2] = 192u;
- orig_pixels[2 * 4 + 3] = 255u;
+ orig_pixels[2 * 4 + 3] = 128u;
orig_pixels[3 * 4 + 0] = 16u;
orig_pixels[3 * 4 + 1] = 64u;
orig_pixels[3 * 4 + 2] = 192u;
- orig_pixels[3 * 4 + 3] = 128u;
- ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+ orig_pixels[3 * 4 + 3] = 255u;
+ orig_pixels[4 * 4 + 0] = 255u;
+ orig_pixels[4 * 4 + 1] = 255u;
+ orig_pixels[4 * 4 + 2] = 255u;
+ orig_pixels[4 * 4 + 3] = 255u;
+
+ ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
@@ -61,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
- EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
- EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
- EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
- EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
- EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
- EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
- EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
- EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
+
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
+ EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
+ EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
+
+ // Test that attenuating fully opaque pixels leaves the color channels
+ // unchanged.
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 0;
+ orig_pixels[i * 4 + 2] = 0;
+ orig_pixels[i * 4 + 3] = 255;
+ }
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
+ EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
+ EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
+ EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
+ }
for (int i = 0; i < 1280; ++i) {
orig_pixels[i * 4 + 0] = i;
@@ -82,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
}
for (int i = 0; i < 1280; ++i) {
- EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
}
// Make sure transparent, 50% and opaque are fully accurate.
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
@@ -96,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
- EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
- EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
- EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
+ EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
+ EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
+ EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
free_aligned_buffer_page_end(atten2_pixels);
@@ -151,31 +207,32 @@ static int TestAttenuateI(int width,
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
- int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
static int TestUnattenuateI(int width,
@@ -224,31 +281,31 @@ static int TestUnattenuateI(int width,
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
- int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 1);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, -1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
@@ -277,6 +334,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
}
}
+// NEAR tolerances below allow off-by-one results on legacy platforms.
TEST_F(LibYUVPlanarTest, TestARGBGray) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -313,17 +371,17 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
- EXPECT_EQ(30u, orig_pixels[0][0]);
- EXPECT_EQ(30u, orig_pixels[0][1]);
- EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_NEAR(29u, orig_pixels[0][0], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][1], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][2], 1);
EXPECT_EQ(128u, orig_pixels[0][3]);
EXPECT_EQ(149u, orig_pixels[1][0]);
EXPECT_EQ(149u, orig_pixels[1][1]);
EXPECT_EQ(149u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]);
- EXPECT_EQ(76u, orig_pixels[2][0]);
- EXPECT_EQ(76u, orig_pixels[2][1]);
- EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_NEAR(77u, orig_pixels[2][0], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][1], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][2], 1);
EXPECT_EQ(255u, orig_pixels[2][3]);
EXPECT_EQ(0u, orig_pixels[3][0]);
EXPECT_EQ(0u, orig_pixels[3][1]);
@@ -333,9 +391,9 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
EXPECT_EQ(255u, orig_pixels[4][1]);
EXPECT_EQ(255u, orig_pixels[4][2]);
EXPECT_EQ(255u, orig_pixels[4][3]);
- EXPECT_EQ(96u, orig_pixels[5][0]);
- EXPECT_EQ(96u, orig_pixels[5][1]);
- EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_NEAR(97u, orig_pixels[5][0], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][1], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][2], 1);
EXPECT_EQ(224u, orig_pixels[5][3]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -385,30 +443,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
- EXPECT_EQ(30u, gray_pixels[0][0]);
- EXPECT_EQ(30u, gray_pixels[0][1]);
- EXPECT_EQ(30u, gray_pixels[0][2]);
- EXPECT_EQ(128u, gray_pixels[0][3]);
- EXPECT_EQ(149u, gray_pixels[1][0]);
- EXPECT_EQ(149u, gray_pixels[1][1]);
- EXPECT_EQ(149u, gray_pixels[1][2]);
- EXPECT_EQ(0u, gray_pixels[1][3]);
- EXPECT_EQ(76u, gray_pixels[2][0]);
- EXPECT_EQ(76u, gray_pixels[2][1]);
- EXPECT_EQ(76u, gray_pixels[2][2]);
- EXPECT_EQ(255u, gray_pixels[2][3]);
- EXPECT_EQ(0u, gray_pixels[3][0]);
- EXPECT_EQ(0u, gray_pixels[3][1]);
- EXPECT_EQ(0u, gray_pixels[3][2]);
- EXPECT_EQ(255u, gray_pixels[3][3]);
- EXPECT_EQ(255u, gray_pixels[4][0]);
- EXPECT_EQ(255u, gray_pixels[4][1]);
- EXPECT_EQ(255u, gray_pixels[4][2]);
- EXPECT_EQ(255u, gray_pixels[4][3]);
- EXPECT_EQ(96u, gray_pixels[5][0]);
- EXPECT_EQ(96u, gray_pixels[5][1]);
- EXPECT_EQ(96u, gray_pixels[5][2]);
- EXPECT_EQ(224u, gray_pixels[5][3]);
+ EXPECT_NEAR(30u, gray_pixels[0][0], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][1], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][2], 1);
+ EXPECT_NEAR(128u, gray_pixels[0][3], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][0], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][1], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][2], 1);
+ EXPECT_NEAR(0u, gray_pixels[1][3], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][0], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][1], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[2][3], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][0], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][1], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[3][3], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][0], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][1], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][3], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][0], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][1], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][2], 1);
+ EXPECT_NEAR(224u, gray_pixels[5][3], 1);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
@@ -418,6 +476,20 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
}
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i;
+ orig_pixels[i][2] = i;
+ orig_pixels[i][3] = i;
+ }
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, orig_pixels[i][0]);
+ EXPECT_EQ(i, orig_pixels[i][1]);
+ EXPECT_EQ(i, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
}
TEST_F(LibYUVPlanarTest, TestARGBSepia) {
@@ -763,27 +835,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i / 4;
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
- EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
- EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
- EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestShade) {
@@ -1006,10 +1126,91 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
}
}
+TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels_0[1280]);
+ SIMD_ALIGNED(uint16_t orig_pixels_1[1280]);
+ SIMD_ALIGNED(uint16_t interpolate_pixels[1280]);
+ memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+ memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
+
+ orig_pixels_0[0] = 16u;
+ orig_pixels_0[1] = 32u;
+ orig_pixels_0[2] = 64u;
+ orig_pixels_0[3] = 128u;
+ orig_pixels_0[4] = 0u;
+ orig_pixels_0[5] = 0u;
+ orig_pixels_0[6] = 0u;
+ orig_pixels_0[7] = 255u;
+ orig_pixels_0[8] = 0u;
+ orig_pixels_0[9] = 0u;
+ orig_pixels_0[10] = 0u;
+ orig_pixels_0[11] = 0u;
+ orig_pixels_0[12] = 0u;
+ orig_pixels_0[13] = 0u;
+ orig_pixels_0[14] = 0u;
+ orig_pixels_0[15] = 0u;
+
+ orig_pixels_1[0] = 0u;
+ orig_pixels_1[1] = 0u;
+ orig_pixels_1[2] = 0u;
+ orig_pixels_1[3] = 0u;
+ orig_pixels_1[4] = 0u;
+ orig_pixels_1[5] = 0u;
+ orig_pixels_1[6] = 0u;
+ orig_pixels_1[7] = 0u;
+ orig_pixels_1[8] = 0u;
+ orig_pixels_1[9] = 0u;
+ orig_pixels_1[10] = 0u;
+ orig_pixels_1[11] = 0u;
+ orig_pixels_1[12] = 255u;
+ orig_pixels_1[13] = 255u;
+ orig_pixels_1[14] = 255u;
+ orig_pixels_1[15] = 255u;
+
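+ // Fraction 128 blends the two sources 50:50.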
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0]);
+ EXPECT_EQ(16u, interpolate_pixels[1]);
+ EXPECT_EQ(32u, interpolate_pixels[2]);
+ EXPECT_EQ(64u, interpolate_pixels[3]);
+ EXPECT_EQ(0u, interpolate_pixels[4]);
+ EXPECT_EQ(0u, interpolate_pixels[5]);
+ EXPECT_EQ(0u, interpolate_pixels[6]);
+ EXPECT_EQ(128u, interpolate_pixels[7]);
+ EXPECT_EQ(0u, interpolate_pixels[8]);
+ EXPECT_EQ(0u, interpolate_pixels[9]);
+ EXPECT_EQ(0u, interpolate_pixels[10]);
+ EXPECT_EQ(0u, interpolate_pixels[11]);
+ EXPECT_EQ(128u, interpolate_pixels[12]);
+ EXPECT_EQ(128u, interpolate_pixels[13]);
+ EXPECT_EQ(128u, interpolate_pixels[14]);
+ EXPECT_EQ(128u, interpolate_pixels[15]);
+
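+ // Fraction 0 returns the first source unchanged.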
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0]);
+ EXPECT_EQ(32u, interpolate_pixels[1]);
+ EXPECT_EQ(64u, interpolate_pixels[2]);
+ EXPECT_EQ(128u, interpolate_pixels[3]);
+
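+ // Fraction 192 weights the second source 3/4 and the first 1/4.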
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0]);
+ EXPECT_EQ(8u, interpolate_pixels[1]);
+ EXPECT_EQ(16u, interpolate_pixels[2]);
+ EXPECT_EQ(32u, interpolate_pixels[3]);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 1280, 1, 123);
+ }
+}
+
#define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \
N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kStrideA = \
(kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
@@ -1041,7 +1242,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
}
#define TESTINTERPOLATE(TERP) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ + 1, TERP, _Any, +, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
@@ -1058,7 +1259,8 @@ static int TestBlend(int width,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
- int off) {
+ int off,
+ int attenuate) {
if (width < 1) {
width = 1;
}
@@ -1072,10 +1274,12 @@ static int TestBlend(int width,
src_argb_a[i + off] = (fastrand() & 0xff);
src_argb_b[i + off] = (fastrand() & 0xff);
}
- ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
- height);
- ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
- height);
+ MemRandomize(src_argb_a, kStride * height + off);
+ MemRandomize(src_argb_b, kStride * height + off);
+ if (attenuate) {
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ }
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
@@ -1104,29 +1308,36 @@ static int TestBlend(int width,
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
int max_diff =
- TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
@@ -1203,7 +1414,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) {
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
}
TEST_F(LibYUVPlanarTest, BlendPlane_Any) {
- TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ TestBlendPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
}
TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
@@ -1298,7 +1509,7 @@ TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {
// TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable.
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {
- TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ TestI420Blend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
}
TEST_F(LibYUVPlanarTest, I420Blend_Invert) {
@@ -1400,6 +1611,251 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) {
EXPECT_EQ(0, err);
}
+TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
+ int i;
+ int y_plane_size = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_y, y_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(orig_y, y_plane_size);
+ memset(dst_c, 1, y_plane_size);
+ memset(dst_opt, 2, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (i = 0; i < benchmark_iterations_; i++) {
+ CopyPlane(orig_y, benchmark_width_, dst_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (i = 0; i < benchmark_iterations_; i++) {
+ CopyPlane(orig_y, benchmark_width_, dst_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestCopyPlaneZero) {
+ // Test to verify copying a rect with a zero height or width does
+ // not touch destination memory.
+ uint8_t src = 42;
+ uint8_t dst = 0;
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ CopyPlane(&src, 0, &dst, 0, 0, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 1, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 0, 1);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ CopyPlane(&src, 0, &dst, 0, 0, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 1, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 0, 1);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetilePlane) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int y_plane_size = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(tile_y, tile_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(tile_y, tile_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_,
+ benchmark_height_, 16);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_,
+ benchmark_height_, 16);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetilePlane_16) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height * 2;
+ int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
+ align_buffer_page_end(tile_y, tile_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(tile_y, tile_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c,
+ benchmark_width_, benchmark_width_, benchmark_height_, 16);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_, 16);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+// Compares DetileSplitUV to 2 step Detile + SplitUV
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
+ align_buffer_page_end(tile_uv, tile_plane_size);
+ align_buffer_page_end(detiled_uv, tile_plane_size);
+ align_buffer_page_end(dst_u_two_stage, uv_plane_size);
+ align_buffer_page_end(dst_u_opt, uv_plane_size);
+ align_buffer_page_end(dst_v_two_stage, uv_plane_size);
+ align_buffer_page_end(dst_v_opt, uv_plane_size);
+
+ MemRandomize(tile_uv, tile_plane_size);
+ memset(detiled_uv, 0, tile_plane_size);
+ memset(dst_u_two_stage, 0, uv_plane_size);
+ memset(dst_u_opt, 0, uv_plane_size);
+ memset(dst_v_two_stage, 0, uv_plane_size);
+ memset(dst_v_opt, 0, uv_plane_size);
+
+ DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2,
+ dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_,
+ benchmark_height_, 16);
+
+ // Benchmark 2 step conversion for comparison.
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_,
+ benchmark_width_, benchmark_height_, 16);
+ SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage,
+ (benchmark_width_ + 1) / 2, dst_v_two_stage,
+ (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
+ benchmark_height_);
+ }
+
+ for (i = 0; i < uv_plane_size; ++i) {
+ EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
+ EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_uv);
+ free_aligned_buffer_page_end(detiled_uv);
+ free_aligned_buffer_page_end(dst_u_two_stage);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_two_stage);
+ free_aligned_buffer_page_end(dst_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
+ align_buffer_page_end(tile_uv, tile_plane_size);
+ align_buffer_page_end(dst_u_c, uv_plane_size);
+ align_buffer_page_end(dst_u_opt, uv_plane_size);
+ align_buffer_page_end(dst_v_c, uv_plane_size);
+ align_buffer_page_end(dst_v_opt, uv_plane_size);
+
+ MemRandomize(tile_uv, tile_plane_size);
+ memset(dst_u_c, 0, uv_plane_size);
+ memset(dst_u_opt, 0, uv_plane_size);
+ memset(dst_v_c, 0, uv_plane_size);
+ memset(dst_v_opt, 0, uv_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+
+ DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2,
+ dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
+ benchmark_height_, 16);
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetileSplitUVPlane(
+ tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+ (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+ }
+
+ for (i = 0; i < uv_plane_size; ++i) {
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_uv);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_v_opt);
+}
+
static int TestMultiply(int width,
int height,
int benchmark_iterations,
@@ -1447,7 +1903,7 @@ static int TestMultiply(int width,
}
TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
- int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
@@ -1522,7 +1978,7 @@ static int TestAdd(int width,
TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
int max_diff =
- TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
@@ -1595,7 +2051,7 @@ static int TestSubtract(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
- int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
@@ -1668,7 +2124,7 @@ static int TestSobel(int width,
TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
int max_diff =
- TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
@@ -1741,7 +2197,7 @@ static int TestSobelToPlane(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
- int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
@@ -1813,7 +2269,7 @@ static int TestSobelXY(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
- int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
@@ -1889,29 +2345,35 @@ static int TestBlur(int width,
return max_diff;
}
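+// When DISABLE_SLOW_TESTS is set, run the blur tests only on x86; on
+// other architectures mark them DISABLED_.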
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+#define DISABLED_ARM(name) name
+#else
+#define DISABLED_ARM(name) DISABLED_##name
+#endif
+
static const int kBlurSize = 55;
-TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) {
int max_diff =
- TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
@@ -1919,35 +2381,35 @@ TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
}
static const int kBlurSmallSize = 5;
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) {
int max_diff =
- TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]);
@@ -2333,12 +2795,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
MaskCpuFlags(disable_cpu_flags_);
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_, benchmark_width_, benchmark_height_);
- MaskCpuFlags(benchmark_cpu_info_);
+ double c_time = get_time();
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ c_time = (get_time() - c_time);
+ MaskCpuFlags(benchmark_cpu_info_);
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_, benchmark_width_, benchmark_height_);
}
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ // Report performance of C vs OPT
+ printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+ static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -2361,12 +2834,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
MaskCpuFlags(disable_cpu_flags_);
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
- MaskCpuFlags(benchmark_cpu_info_);
+ double c_time = get_time();
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ c_time = (get_time() - c_time);
+ MaskCpuFlags(benchmark_cpu_info_);
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+ // Report performance of C vs OPT
+ printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+ static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -2426,7 +2911,7 @@ static int TestARGBRect(int width,
}
TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
- int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0, 4);
EXPECT_EQ(0, max_diff);
@@ -2454,7 +2939,7 @@ TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
}
TEST_F(LibYUVPlanarTest, SetPlane_Any) {
- int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0, 1);
EXPECT_EQ(0, max_diff);
@@ -2483,33 +2968,24 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
- align_buffer_page_end(src_pixels, kPixels * 2);
- align_buffer_page_end(tmp_pixels_u, kPixels);
- align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(src_pixels_u, kPixels);
+ align_buffer_page_end(src_pixels_v, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
- MemRandomize(src_pixels, kPixels * 2);
- MemRandomize(tmp_pixels_u, kPixels);
- MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(src_pixels_u, kPixels);
+ MemRandomize(src_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
-
for (int i = 0; i < benchmark_iterations_; ++i) {
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
}
@@ -2518,9 +2994,43 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
- free_aligned_buffer_page_end(src_pixels);
- free_aligned_buffer_page_end(tmp_pixels_u);
- free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+// 16 bit channel merge
+TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_u, kPixels * 2);
+ align_buffer_page_end(src_pixels_v, kPixels * 2);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
+ MemRandomize(src_pixels_u, kPixels * 2);
+ MemRandomize(src_pixels_v, kPixels * 2);
+ MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2 * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
+ (const uint16_t*)src_pixels_v, benchmark_width_,
+ (uint16_t*)dst_pixels_c, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_, 12);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
+ (const uint16_t*)src_pixels_v, benchmark_width_,
+ (uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_, 12);
+ }
+
+ for (int i = 0; i < kPixels * 2 * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
@@ -2528,47 +3038,112 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 2);
- align_buffer_page_end(tmp_pixels_u, kPixels);
- align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_u_c, kPixels);
+ align_buffer_page_end(dst_pixels_v_c, kPixels);
+ align_buffer_page_end(dst_pixels_u_opt, kPixels);
+ align_buffer_page_end(dst_pixels_v_opt, kPixels);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(dst_pixels_u_c, kPixels);
+ MemRandomize(dst_pixels_v_c, kPixels);
+ MemRandomize(dst_pixels_u_opt, kPixels);
+ MemRandomize(dst_pixels_v_opt, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c,
+ benchmark_width_, dst_pixels_v_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt,
+ benchmark_width_, dst_pixels_v_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+ EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_u_c);
+ free_aligned_buffer_page_end(dst_pixels_v_c);
+ free_aligned_buffer_page_end(dst_pixels_u_opt);
+ free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
+// 16 bit channel split
+TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+ align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
+ align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
+ align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
+ MemRandomize(src_pixels, kPixels * 2 * 2);
+ MemRandomize(dst_pixels_u_c, kPixels * 2);
+ MemRandomize(dst_pixels_v_c, kPixels * 2);
+ MemRandomize(dst_pixels_u_opt, kPixels * 2);
+ MemRandomize(dst_pixels_v_opt, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+ (uint16_t*)dst_pixels_u_c, benchmark_width_,
+ (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
+ benchmark_height_, 10);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+ (uint16_t*)dst_pixels_u_opt, benchmark_width_,
+ (uint16_t*)dst_pixels_v_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_, 10);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+ EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_u_c);
+ free_aligned_buffer_page_end(dst_pixels_v_c);
+ free_aligned_buffer_page_end(dst_pixels_u_opt);
+ free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
MemRandomize(src_pixels, kPixels * 2);
- MemRandomize(tmp_pixels_u, kPixels);
- MemRandomize(tmp_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
- dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
- benchmark_height_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u,
- benchmark_width_, tmp_pixels_v, benchmark_width_,
- benchmark_width_, benchmark_height_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
- dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
- benchmark_height_);
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
- free_aligned_buffer_page_end(tmp_pixels_u);
- free_aligned_buffer_page_end(tmp_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
@@ -2617,6 +3192,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
@@ -2663,10 +3239,373 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
+TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(tmp_pixels_a, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(tmp_pixels_a, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
+ benchmark_width_, tmp_pixels_b, benchmark_width_,
+ tmp_pixels_a, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(tmp_pixels_a);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(tmp_pixels_a, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(tmp_pixels_a, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(tmp_pixels_a);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
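+ // A NULL alpha plane selects the XRGB paths: SplitARGBPlane skips the
+ // alpha channel and MergeARGBPlane writes opaque alpha instead.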
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
+ benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0,
+ dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ }
+
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+// Merge 4 channels
+#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
+ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \
+ kWidth, NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \
+ kWidth, NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(src_memory_a); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+// Merge 3 channel RGB into 4 channel XRGB with opaque alpha
+#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, \
+ 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
+
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10)
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12)
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
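+// Each TESTQPLANARTOP above expands to eight tests per depth, e.g.
+// MergeAR64Plane_10_Any/_Unaligned/_Invert/_Opt plus the
+// MergeAR64Plane_Opaque_10_* variants that pass a NULL alpha plane.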
+
+#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, dst_pixels_c, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, dst_pixels_opt, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
+
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10)
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12)
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
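+// Likewise each TESTTPLANARTOP expands to four tests per depth, e.g.
+// MergeXR30Plane_10_Any; the three-plane form takes no alpha input.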
+
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 8
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
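+ // (n + 7) & ~7 rounds n up to the next multiple of 8, e.g. 1001 -> 1008.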
+
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
@@ -2679,19 +3618,19 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_c), 64, kPixels);
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_c), 16, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) {
MergeUVRow_16_AVX2(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 16,
kPixels);
} else {
MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 16,
kPixels);
}
}
@@ -2710,7 +3649,9 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 32
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2776,6 +3717,65 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+TEST_F(LibYUVPlanarTest, YUY2ToY) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ YUY2ToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ YUY2ToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+TEST_F(LibYUVPlanarTest, UYVYToY) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ UYVYToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ UYVYToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
@@ -2822,6 +3822,36 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
}
#endif // HAS_CONVERT16TO8ROW_AVX2
+#ifdef HAS_UYVYTOYROW_NEON
+TEST_F(LibYUVPlanarTest, UYVYToYRow_Opt) {
+ // NEON does multiple of 16, so round count up
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ UYVYToYRow_C(src_pixels_y, dst_pixels_y_c, kPixels);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ UYVYToYRow_NEON(src_pixels_y, dst_pixels_y_opt, kPixels);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#endif // ENABLE_ROW_TESTS
+
TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
@@ -2855,6 +3885,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
@@ -3173,33 +4204,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
- SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 + 4; ++i) {
+ for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
- GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3225,141 +4256,139 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
- SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+ SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 * 5; ++i) {
- orig_pixels[i] = i;
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<uint16_t>(i);
}
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
- 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+ 1280);
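+ // GaussCol weights the five source rows 1:4:6:4:1, the binomial
+ // approximation of a Gaussian kernel.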
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
} else {
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
- 640);
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
-
- EXPECT_EQ(dst_pixels_c[0],
- static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
- 640 * 4 * 1));
- EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
}
-float TestFloatDivToByte(int benchmark_width,
- int benchmark_height,
- int benchmark_iterations,
- float scale,
- bool opt) {
- int i, j;
- // NEON does multiple of 8, so round count up
- const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
- align_buffer_page_end(src_weights, kPixels * 4);
- align_buffer_page_end(src_values, kPixels * 4);
- align_buffer_page_end(dst_out_c, kPixels);
- align_buffer_page_end(dst_out_opt, kPixels);
- align_buffer_page_end(dst_mask_c, kPixels);
- align_buffer_page_end(dst_mask_opt, kPixels);
-
- // Randomize works but may contain some denormals affecting performance.
- // MemRandomize(orig_y, kPixels * 4);
- // large values are problematic. audio is really -1 to 1.
- for (i = 0; i < kPixels; ++i) {
- (reinterpret_cast<float*>(src_weights))[i] = scale;
- (reinterpret_cast<float*>(src_values))[i] =
- sinf(static_cast<float>(i) * 0.1f);
- }
- memset(dst_out_c, 0, kPixels);
- memset(dst_out_opt, 1, kPixels);
- memset(dst_mask_c, 2, kPixels);
- memset(dst_mask_opt, 3, kPixels);
+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+ SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_c,
- dst_mask_c, kPixels);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (j = 0; j < benchmark_iterations; j++) {
- if (opt) {
-#ifdef HAS_FLOATDIVTOBYTEROW_NEON
- FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
-#else
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
-#endif
+ for (int i = 0; i < 1280 + 4; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
+#else
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+#endif
}
- uint8_t max_diff = 0;
- for (i = 0; i < kPixels; ++i) {
- uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
- abs(dst_mask_c[i] - dst_mask_opt[i]);
- if (abs_diff > max_diff) {
- max_diff = abs_diff;
- }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+}
- free_aligned_buffer_page_end(src_weights);
- free_aligned_buffer_page_end(src_values);
- free_aligned_buffer_page_end(dst_out_c);
- free_aligned_buffer_page_end(dst_out_opt);
- free_aligned_buffer_page_end(dst_mask_c);
- free_aligned_buffer_page_end(dst_mask_opt);
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+ align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows of floats
+ float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
- return max_diff;
-}
+ memset(orig_pixels, 0, 1280 * 5 * 4);
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
-TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
- float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
- benchmark_iterations_, 1.2f, false);
- EXPECT_EQ(0, diff);
-}
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
-TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
- float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
- benchmark_iterations_, 1.2f, true);
- EXPECT_EQ(0, diff);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(orig_pixels_buf);
}
-TEST_F(LibYUVPlanarTest, UVToVURow) {
+TEST_F(LibYUVPlanarTest, SwapUVRow) {
const int kPixels = benchmark_width_ * benchmark_height_;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+
align_buffer_page_end(src_pixels_vu, kPixels * 2);
align_buffer_page_end(dst_pixels_uv, kPixels * 2);
-
MemRandomize(src_pixels_vu, kPixels * 2);
memset(dst_pixels_uv, 1, kPixels * 2);
- UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels);
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(kPixels, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
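+ // SwapUVRow_NEON needs a multiple of 16 pixels; the _Any wrapper handles
+ // the remainder, so it is chosen when kPixels is not 16-aligned.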
+ for (int j = 0; j < benchmark_iterations_; j++) {
+ SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels);
+ }
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
@@ -3368,5 +4397,223 @@ TEST_F(LibYUVPlanarTest, UVToVURow) {
free_aligned_buffer_page_end(src_pixels_vu);
free_aligned_buffer_page_end(dst_pixels_uv);
}
+#endif // ENABLE_ROW_TESTS
+
+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+ const int kSize = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_pixels_opt, kSize);
+ align_buffer_page_end(dst_pixels_c, kSize);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+ }
+ memset(dst_pixels_opt, 1, kSize);
+ memset(dst_pixels_c, 2, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ }
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
+ }
+
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
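+ // HalfMergeUVPlane 2x2-averages full-resolution U and V and writes
+ // interleaved UV at half size, hence the rounded-up half dimensions.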
+ int dst_width = (benchmark_width_ + 1) / 2;
+ int dst_height = (benchmark_height_ + 1) / 2;
+ align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+ align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+ align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+ MemRandomize(tmp_pixels_u, dst_width * dst_height);
+ MemRandomize(tmp_pixels_v, dst_width * dst_height);
+ MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_c, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+ EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
+TEST_F(LibYUVPlanarTest, NV12Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
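+ // NV12 chroma is one interleaved UV plane at half resolution, so its row
+ // stride is halfwidth * 2 bytes.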
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_uv, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_uv, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_uv, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y,
+ benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_uv[i], dst_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVPlanarTest, NV21Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_vu, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_vu, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
+ benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_vu[i], dst_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_vu);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \
+ defined(__aarch64__)
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
+ int i, j;
+ const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+ align_buffer_page_end(orig_f, y_plane_size * 4);
+ align_buffer_page_end(orig_y, y_plane_size * 2);
+ align_buffer_page_end(dst_opt, y_plane_size * 4);
+ align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+ }
+ memset(orig_y, 1, y_plane_size * 2);
+ memset(dst_opt, 2, y_plane_size * 4);
+ memset(rec_opt, 3, y_plane_size * 2);
+
+ ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+ y_plane_size);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt,
+ y_plane_size);
+ }
+
+ ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+ y_plane_size);
+
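+ // FP16 -> FP32 is exact (every half value is representable as a float),
+ // so converting dst_opt back to FP16 must match orig_y bit for bit.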
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_f);
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(rec_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
+ int i, j;
+ const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+ align_buffer_page_end(orig_f, y_plane_size * 4);
+ align_buffer_page_end(orig_y, y_plane_size * 2);
+ align_buffer_page_end(dst_opt, y_plane_size * 4);
+ align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+ }
+ memset(orig_y, 1, y_plane_size * 2);
+ memset(dst_opt, 2, y_plane_size * 4);
+ memset(rec_opt, 3, y_plane_size * 2);
+
+ ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+ y_plane_size);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt,
+ y_plane_size);
+ }
+
+ ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+ y_plane_size);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_f);
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(rec_opt);
+}
+
+#endif // defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
} // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc
index d2003895..74952c4e 100644
--- a/files/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@@ -156,31 +156,179 @@ TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
+static void TestRotatePlane_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_stride = src_width;
+ int src_plane_size = src_stride * abs(src_height);
+ align_buffer_page_end_16(src, src_plane_size);
+ for (int i = 0; i < src_plane_size; ++i) {
+ src[i] = fastrand() & 0xff;
+ }
+
+ int dst_stride = dst_width;
+ int dst_plane_size = dst_stride * dst_height;
+ align_buffer_page_end_16(dst_c, dst_plane_size);
+ align_buffer_page_end_16(dst_opt, dst_plane_size);
+ memset(dst_c, 2, dst_plane_size);
+ memset(dst_opt, 3, dst_plane_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ RotatePlane_16(src, src_stride, dst_c, dst_stride, src_width, src_height,
+ mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ RotatePlane_16(src, src_stride, dst_opt, dst_stride, src_width, src_height,
+ mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_c);
+ free_aligned_buffer_page_end_16(dst_opt);
+ free_aligned_buffer_page_end_16(src);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
} // namespace libyuv
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
new file mode 100644
index 00000000..abc08efa
--- /dev/null
+++ b/unit_test/rotate_test.cc
@@ -0,0 +1,962 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/rotate_row.h"
+#endif
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
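+// e.g. SUBSAMPLE(5, 2) == 3: chroma dimensions round up for odd luma sizes.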
+
+static void I420TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i420_y_size = src_width * Abs(src_height);
+ int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+ align_buffer_page_end(src_i420, src_i420_size);
+ for (int i = 0; i < src_i420_size; ++i) {
+ src_i420[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
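+  // Pattern used throughout this file: generate a C reference with SIMD
+  // masked off, rerun with optimizations under benchmark iterations, then
+  // require a byte-exact match since rotation is lossless.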
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
+ (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
+ (src_width + 1) / 2, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I420Rotate(
+ src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+ dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
+ (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_i420);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code, which can
+// instead be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+static void I422TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i422_y_size = src_width * Abs(src_height);
+ int src_i422_uv_size = ((src_width + 1) / 2) * Abs(src_height);
+ int src_i422_size = src_i422_y_size + src_i422_uv_size * 2;
+ align_buffer_page_end(src_i422, src_i422_size);
+ for (int i = 0; i < src_i422_size; ++i) {
+ src_i422[i] = fastrand() & 0xff;
+ }
+
+ int dst_i422_y_size = dst_width * dst_height;
+ int dst_i422_uv_size = ((dst_width + 1) / 2) * dst_height;
+ int dst_i422_size = dst_i422_y_size + dst_i422_uv_size * 2;
+ align_buffer_page_end(dst_i422_c, dst_i422_size);
+ align_buffer_page_end(dst_i422_opt, dst_i422_size);
+ memset(dst_i422_c, 2, dst_i422_size);
+ memset(dst_i422_opt, 3, dst_i422_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I422Rotate(src_i422, src_width, src_i422 + src_i422_y_size,
+ (src_width + 1) / 2, src_i422 + src_i422_y_size + src_i422_uv_size,
+ (src_width + 1) / 2, dst_i422_c, dst_width,
+ dst_i422_c + dst_i422_y_size, (dst_width + 1) / 2,
+ dst_i422_c + dst_i422_y_size + dst_i422_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I422Rotate(
+ src_i422, src_width, src_i422 + src_i422_y_size, (src_width + 1) / 2,
+ src_i422 + src_i422_y_size + src_i422_uv_size, (src_width + 1) / 2,
+ dst_i422_opt, dst_width, dst_i422_opt + dst_i422_y_size,
+ (dst_width + 1) / 2, dst_i422_opt + dst_i422_y_size + dst_i422_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i422_size; ++i) {
+ EXPECT_EQ(dst_i422_c[i], dst_i422_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i422_c);
+ free_aligned_buffer_page_end(dst_i422_opt);
+ free_aligned_buffer_page_end(src_i422);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate0_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate90_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate180_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate270_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I444TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i444_y_size = src_width * Abs(src_height);
+ int src_i444_uv_size = src_width * Abs(src_height);
+ int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
+ align_buffer_page_end(src_i444, src_i444_size);
+ for (int i = 0; i < src_i444_size; ++i) {
+ src_i444[i] = fastrand() & 0xff;
+ }
+
+ int dst_i444_y_size = dst_width * dst_height;
+ int dst_i444_uv_size = dst_width * dst_height;
+ int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
+ align_buffer_page_end(dst_i444_c, dst_i444_size);
+ align_buffer_page_end(dst_i444_opt, dst_i444_size);
+ memset(dst_i444_c, 2, dst_i444_size);
+ memset(dst_i444_opt, 3, dst_i444_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
+ dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
+ dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i444_size; ++i) {
+ EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i444_c);
+ free_aligned_buffer_page_end(dst_i444_opt);
+ free_aligned_buffer_page_end(src_i444);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code, which can
+// instead be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+static void NV12TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) { // allow negative for inversion test.
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_nv12_y_size = src_width * Abs(src_height);
+ int src_nv12_uv_size =
+ ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
+ int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+ align_buffer_page_end(src_nv12, src_nv12_size);
+ for (int i = 0; i < src_nv12_size; ++i) {
+ src_nv12[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
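+  // The NV12 source stores U and V interleaved in a single plane, so its UV
+  // stride is the width rounded up to even ((src_width + 1) & ~1) rather
+  // than the half width used by the planar formats above.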
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_opt, dst_width,
+ dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_nv12);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// Test Android 420 to I420 Rotate
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V, ROT) \
+ TEST_F(LibYUVRotateTest, \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate##ROT##To##PN##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8_t* src_u = src_uv + OFF_U; \
+ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \
+ (libyuv::RotationMode)ROT); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \
+ (libyuv::RotationMode)ROT); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \
+ _Any, +, 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 2, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V, 180)
+
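+// Android420 uses pixel_stride 1 for planar I420, or pixel_stride 2 for
+// interleaved chroma: NV12 (V at byte offset 1) and NV21 (U at byte offset 1).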
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+#undef TESTAPLANARTOP
+#undef TESTAPLANARTOPI
+
+static void I010TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i010_y_size = src_width * Abs(src_height);
+ int src_i010_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_i010_size = src_i010_y_size + src_i010_uv_size * 2;
+ align_buffer_page_end_16(src_i010, src_i010_size);
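+  // I010 carries 10 bit samples in 16 bit words; mask the random data to 10
+  // bits.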
+ for (int i = 0; i < src_i010_size; ++i) {
+ src_i010[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i010_y_size = dst_width * dst_height;
+ int dst_i010_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i010_size = dst_i010_y_size + dst_i010_uv_size * 2;
+ align_buffer_page_end_16(dst_i010_c, dst_i010_size);
+ align_buffer_page_end_16(dst_i010_opt, dst_i010_size);
+ memset(dst_i010_c, 2, dst_i010_size * 2);
+ memset(dst_i010_opt, 3, dst_i010_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I010Rotate(src_i010, src_width, src_i010 + src_i010_y_size,
+ (src_width + 1) / 2, src_i010 + src_i010_y_size + src_i010_uv_size,
+ (src_width + 1) / 2, dst_i010_c, dst_width,
+ dst_i010_c + dst_i010_y_size, (dst_width + 1) / 2,
+ dst_i010_c + dst_i010_y_size + dst_i010_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I010Rotate(
+ src_i010, src_width, src_i010 + src_i010_y_size, (src_width + 1) / 2,
+ src_i010 + src_i010_y_size + src_i010_uv_size, (src_width + 1) / 2,
+ dst_i010_opt, dst_width, dst_i010_opt + dst_i010_y_size,
+ (dst_width + 1) / 2, dst_i010_opt + dst_i010_y_size + dst_i010_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i010_size; ++i) {
+ EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i010_c);
+ free_aligned_buffer_page_end_16(dst_i010_opt);
+ free_aligned_buffer_page_end_16(src_i010);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate0_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate90_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate180_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate270_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I210TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i210_y_size = src_width * Abs(src_height);
+ int src_i210_uv_size = ((src_width + 1) / 2) * Abs(src_height);
+ int src_i210_size = src_i210_y_size + src_i210_uv_size * 2;
+ align_buffer_page_end_16(src_i210, src_i210_size);
+ for (int i = 0; i < src_i210_size; ++i) {
+ src_i210[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i210_y_size = dst_width * dst_height;
+ int dst_i210_uv_size = ((dst_width + 1) / 2) * dst_height;
+ int dst_i210_size = dst_i210_y_size + dst_i210_uv_size * 2;
+ align_buffer_page_end_16(dst_i210_c, dst_i210_size);
+ align_buffer_page_end_16(dst_i210_opt, dst_i210_size);
+ memset(dst_i210_c, 2, dst_i210_size * 2);
+ memset(dst_i210_opt, 3, dst_i210_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I210Rotate(src_i210, src_width, src_i210 + src_i210_y_size,
+ (src_width + 1) / 2, src_i210 + src_i210_y_size + src_i210_uv_size,
+ (src_width + 1) / 2, dst_i210_c, dst_width,
+ dst_i210_c + dst_i210_y_size, (dst_width + 1) / 2,
+ dst_i210_c + dst_i210_y_size + dst_i210_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I210Rotate(
+ src_i210, src_width, src_i210 + src_i210_y_size, (src_width + 1) / 2,
+ src_i210 + src_i210_y_size + src_i210_uv_size, (src_width + 1) / 2,
+ dst_i210_opt, dst_width, dst_i210_opt + dst_i210_y_size,
+ (dst_width + 1) / 2, dst_i210_opt + dst_i210_y_size + dst_i210_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i210_size; ++i) {
+ EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i210_c);
+ free_aligned_buffer_page_end_16(dst_i210_opt);
+ free_aligned_buffer_page_end_16(src_i210);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate0_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate90_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate180_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate270_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I410TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i410_y_size = src_width * Abs(src_height);
+ int src_i410_uv_size = src_width * Abs(src_height);
+ int src_i410_size = src_i410_y_size + src_i410_uv_size * 2;
+ align_buffer_page_end_16(src_i410, src_i410_size);
+ for (int i = 0; i < src_i410_size; ++i) {
+ src_i410[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i410_y_size = dst_width * dst_height;
+ int dst_i410_uv_size = dst_width * dst_height;
+ int dst_i410_size = dst_i410_y_size + dst_i410_uv_size * 2;
+ align_buffer_page_end_16(dst_i410_c, dst_i410_size);
+ align_buffer_page_end_16(dst_i410_opt, dst_i410_size);
+ memset(dst_i410_c, 2, dst_i410_size * 2);
+ memset(dst_i410_opt, 3, dst_i410_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width,
+ src_i410 + src_i410_y_size + src_i410_uv_size, src_width,
+ dst_i410_c, dst_width, dst_i410_c + dst_i410_y_size, dst_width,
+ dst_i410_c + dst_i410_y_size + dst_i410_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width,
+ src_i410 + src_i410_y_size + src_i410_uv_size, src_width,
+ dst_i410_opt, dst_width, dst_i410_opt + dst_i410_y_size,
+ dst_width, dst_i410_opt + dst_i410_y_size + dst_i410_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i410_size; ++i) {
+ EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i410_c);
+ free_aligned_buffer_page_end_16(dst_i410_opt);
+ free_aligned_buffer_page_end_16(src_i410);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate0_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate90_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate180_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+#if defined(ENABLE_ROW_TESTS)
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
+ // dst width and height
+ const int width = 4;
+ const int height = 4;
+ int src_pixels[4][4];
+ int dst_pixels_c[4][4];
+ int dst_pixels_opt[4][4];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ src_pixels[i][j] = i * 10 + j;
+ }
+ }
+ memset(dst_pixels_c, 1, width * height * 4);
+ memset(dst_pixels_opt, 2, width * height * 4);
+
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_c, width * 4, width);
+
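+  // Normalize the loop count so the total work matches benchmark_iterations_
+  // over benchmark_width_ x benchmark_height_ pixels, 16 pixels (one 4x4
+  // tile) per call, rounding up.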
+ const int benchmark_iterations =
+ (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
+ (4 * 4);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#endif
+ {
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ }
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+ EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+ }
+ }
+}
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
+ // dst width and height
+ const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
+ const int height = 4;
+ align_buffer_page_end(src_pixels, height * width * 4);
+ align_buffer_page_end(dst_pixels_c, width * height * 4);
+ align_buffer_page_end(dst_pixels_opt, width * height * 4);
+
+ MemRandomize(src_pixels, height * width * 4);
+ memset(dst_pixels_c, 1, width * height * 4);
+ memset(dst_pixels_opt, 2, width * height * 4);
+
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_c, width * 4, width);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else if (TestCpuFlag(kCpuHasSSE2)) {
+ Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#endif
+ {
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ }
+ }
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+}
+
+#endif // ENABLE_ROW_TESTS
+
+} // namespace libyuv
diff --git a/files/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 94aef60e..f54a68f1 100644
--- a/files/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -22,6 +22,12 @@ namespace libyuv {
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that run unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width,
int src_height,
@@ -114,8 +120,8 @@ static int ARGBTestFilter(int src_width,
return max_diff;
}
-static const int kTileX = 8;
-static const int kTileY = 8;
+static const int kTileX = 64;
+static const int kTileY = 64;
static int TileARGBScale(const uint8_t* src_argb,
int src_stride_argb,
@@ -232,7 +238,7 @@ static int ARGBClipTestFilter(int src_width,
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
@@ -241,7 +247,7 @@ static int ARGBClipTestFilter(int src_width,
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
+ TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
@@ -251,15 +257,30 @@ static int ARGBClipTestFilter(int src_width,
// Test a scale factor with all 4 filters. Expect unfiltered to be exact; the
// filtered results may differ because SSSE3, NEON and C use different
// fixed-point implementations.
-#define TEST_FACTOR(name, nom, denom) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, 3)
+#ifndef DISABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3)
+#else
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3)
+#endif
+#endif
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
+#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8)
+#endif
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
@@ -268,7 +289,7 @@ TEST_FACTOR(3, 1, 3)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
@@ -282,34 +303,70 @@ TEST_FACTOR(3, 1, 3)
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##ClipTo##width##x##height##_##filter) { \
int diff = \
ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-/// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3)
+#ifndef DISABLE_SLOW_TESTS
+// Test scale to a specified size with the None, Linear and Bilinear filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
+#endif
+#endif
TEST_SCALETO(ARGBScale, 1, 1)
-TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 1280, 720)
TEST_SCALETO(ARGBScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(ARGBScale, None, 0)
+TEST_SCALESWAPXY1(ARGBScale, Linear, 0)
+TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
// Scale with YUV conversion to ARGB and clipping.
// TODO(fbarchard): Add fourcc support. All 4 ARGB formats is easy to support.
LIBYUV_API
@@ -454,4 +511,78 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
EXPECT_LE(diff, 10);
}
+TEST_F(LibYUVScaleTest, ARGBTest3x) {
+ const int kSrcStride = 480 * 4;
+ const int kDstStride = 160 * 4;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
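+  // Scale the loop count so each benchmark pass covers roughly
+  // benchmark_width_ x benchmark_height_ pixels, 160 output pixels per call.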
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
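+  // Each channel of dst pixel 0 effectively samples src pixel 481 (row 1,
+  // col 1, the centre of the first 3x3 block); 481 & 255 = 225. kFilterNone
+  // below picks the same pixel.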
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+ EXPECT_EQ(226, dest_pixels[2]);
+ EXPECT_EQ(235, dest_pixels[3]);
+
+ ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+ EXPECT_EQ(226, dest_pixels[2]);
+ EXPECT_EQ(235, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, ARGBTest4x) {
+ const int kSrcStride = 640 * 4;
+ const int kDstStride = 160 * 4;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_NEAR(66, dest_pixels[0], 4);
+ EXPECT_NEAR(255 - 66, dest_pixels[1], 4);
+ EXPECT_NEAR(67, dest_pixels[2], 4);
+ EXPECT_NEAR(76, dest_pixels[3], 4);
+
+ ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
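+  // kFilterNone picks src pixel (row 2, col 2): (640 * 2 + 2) & 255 = 2.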
+ EXPECT_EQ(2, dest_pixels[0]);
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+ EXPECT_EQ(3, dest_pixels[2]);
+ EXPECT_EQ(12, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
} // namespace libyuv
diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc
new file mode 100644
index 00000000..9ce47a02
--- /dev/null
+++ b/unit_test/scale_plane_test.cc
@@ -0,0 +1,470 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that run unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+#ifdef ENABLE_ROW_TESTS
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
+ SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ if (!has_ssse3) {
+ printf("Warning SSSE3 not detected; Skipping test.\n");
+ } else {
+ // TL.
+ orig_pixels[0] = 255u;
+ orig_pixels[1] = 0u;
+ orig_pixels[128 + 0] = 0u;
+ orig_pixels[128 + 1] = 0u;
+ // TR.
+ orig_pixels[2] = 0u;
+ orig_pixels[3] = 100u;
+ orig_pixels[128 + 2] = 0u;
+ orig_pixels[128 + 3] = 0u;
+ // BL.
+ orig_pixels[4] = 0u;
+ orig_pixels[5] = 0u;
+ orig_pixels[128 + 4] = 50u;
+ orig_pixels[128 + 5] = 0u;
+ // BR.
+ orig_pixels[6] = 0u;
+ orig_pixels[7] = 0u;
+ orig_pixels[128 + 6] = 0u;
+ orig_pixels[128 + 7] = 20u;
+ // Odd.
+ orig_pixels[126] = 4u;
+ orig_pixels[127] = 255u;
+ orig_pixels[128 + 126] = 16u;
+ orig_pixels[128 + 127] = 255u;
+
+ // Test regular half size.
+ ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
+
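+    // 2x2 box averages with rounding, e.g. dst[0] = (255 + 0 + 0 + 0 + 2) / 4
+    // = 64 and dst[63] = (4 + 255 + 16 + 255 + 2) / 4 = 133.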
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(133u, dst_pixels_c[63]);
+
+    // Test odd width version - the last pixel covers just 1 source column.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
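+    // The odd width last pixel averages one source column over two rows:
+    // (4 + 16 + 1) / 2 = 10.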
+ EXPECT_EQ(10u, dst_pixels_c[63]);
+
+    // Test with one pixel less; the last destination pixel should be skipped.
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(0u, dst_pixels_c[63]);
+
+ // Test regular half size SSSE3.
+ ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+
+ EXPECT_EQ(64u, dst_pixels_opt[0]);
+ EXPECT_EQ(25u, dst_pixels_opt[1]);
+ EXPECT_EQ(13u, dst_pixels_opt[2]);
+ EXPECT_EQ(5u, dst_pixels_opt[3]);
+ EXPECT_EQ(0u, dst_pixels_opt[4]);
+ EXPECT_EQ(133u, dst_pixels_opt[63]);
+
+    // Verify the C and SSSE3 odd width versions match.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+ ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+ for (int i = 0; i < 64; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ }
+}
+#endif // HAS_SCALEROWDOWN2_SSSE3
+
+extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 2560 * 2; ++i) {
+ orig_pixels[i] = i;
+ }
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ }
+#else
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
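+  // Last pixel: (2558 + 2559 + 5118 + 5119 + 2) / 4 = 3839.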
+ EXPECT_EQ(dst_pixels_c[1279], 3839);
+}
+#endif // ENABLE_ROW_TESTS
+
+// Test plane scaling with 8 bit C vs 16 bit C and return the maximum pixel
+// difference.
+// 0 = exact.
+static int TestPlaneFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int src_stride_y = Abs(src_width);
+ int dst_y_plane_size = dst_width * dst_height;
+ int dst_stride_y = dst_width;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ memset(dst_y_8, 0, dst_y_plane_size);
+ memset(dst_y_16, 1, dst_y_plane_size * 2);
+
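+  // Replicate the 8 bit source into the 16 bit buffer so both scalers see
+  // identical input and the outputs can be compared directly.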
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i] & 255;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y,
+ dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16,
+ dst_stride_y, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_y_16);
+
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// The factor of 2 keeps the dimensions even for chroma subsampling.
+#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
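+// For example, with benchmark_width_ = 1280 and nom/denom = 3/4:
+// SX gives ((1280 / 3 + 1) / 2) * 4 * 2 = 1704 source pixels and
+// DX gives ((1280 / 3 + 1) / 2) * 3 * 2 = 1278 destination pixels,
+// an exact 3/4 scale with both dimensions even.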
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \
+ int diff = TestPlaneFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact; the
+// filtered results may differ because SSSE3, NEON and C use different
+// fixed-point implementations.
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+// TEST_FACTOR(8, 1, 8, 0) is disabled for benchmark performance: ~90 seconds.
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 0)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+ const int kSrcStride = 480;
+ const int kDstStride = 160;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
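+  // dst[0] effectively samples src(row 1, col 1) = index 481, and
+  // 481 & 255 = 225; kFilterNone below picks the same pixel.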
+ EXPECT_EQ(225, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+ const int kSrcStride = 640;
+ const int kDstStride = 160;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+  EXPECT_EQ(2, dest_pixels[0]);  // 3rd pixel of 3rd row: (640 * 2 + 2) & 255.
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterNone);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterNone);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBilinear);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBilinear);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBox);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBox);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest1_Box) {
+ align_buffer_page_end(orig_pixels, 3);
+ align_buffer_page_end(dst_pixels, 3);
+
+ // Pad the 1x1 byte image with invalid values before and after in case libyuv
+ // reads outside the memory boundaries.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 1; // scale this pixel
+ orig_pixels[2] = 2;
+ dst_pixels[0] = 3;
+ dst_pixels[1] = 3;
+ dst_pixels[2] = 3;
+
+ libyuv::ScalePlane(orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1,
+ /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
+ /* dst_width= */ 1, /* dst_height= */ 2,
+ libyuv::kFilterBox);
+
+ EXPECT_EQ(dst_pixels[0], 1);
+ EXPECT_EQ(dst_pixels[1], 1);
+ EXPECT_EQ(dst_pixels[2], 3);
+
+ free_aligned_buffer_page_end(dst_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
+ align_buffer_page_end(orig_pixels_alloc, 3 * 2);
+ align_buffer_page_end(dst_pixels_alloc, 3 * 2);
+ uint16_t* orig_pixels = (uint16_t*)orig_pixels_alloc;
+ uint16_t* dst_pixels = (uint16_t*)dst_pixels_alloc;
+
+  // Pad the 1x1 uint16_t image with invalid values before and after in case
+  // libyuv reads outside the memory boundaries.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 1; // scale this pixel
+ orig_pixels[2] = 2;
+ dst_pixels[0] = 3;
+ dst_pixels[1] = 3;
+ dst_pixels[2] = 3;
+
+ libyuv::ScalePlane_16(
+ orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1,
+ /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
+ /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone);
+
+ EXPECT_EQ(dst_pixels[0], 1);
+ EXPECT_EQ(dst_pixels[1], 1);
+ EXPECT_EQ(dst_pixels[2], 3);
+
+ free_aligned_buffer_page_end(dst_pixels_alloc);
+ free_aligned_buffer_page_end(orig_pixels_alloc);
+}
+} // namespace libyuv
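The plane tests above all follow one pattern: mask the CPU flags to force the
C reference path, rerun with the real flags, and compare the two outputs. A
minimal sketch of that pattern against the public libyuv API (not part of the
patch; the helper name is hypothetical):

#include <cstdint>
#include <cstring>  // std::memcmp

#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"

// Returns true when the optimized ScalePlane output matches the C path.
bool ScalePlaneMatchesC(const uint8_t* src, int w, int h, int dw, int dh,
                        uint8_t* dst_c, uint8_t* dst_opt) {
  libyuv::MaskCpuFlags(1);   // Disable all CPU optimizations (C only).
  libyuv::ScalePlane(src, w, w, h, dst_c, dw, dw, dh, libyuv::kFilterBilinear);
  libyuv::MaskCpuFlags(-1);  // Re-enable all detected SIMD paths.
  libyuv::ScalePlane(src, w, w, h, dst_opt, dw, dw, dh,
                     libyuv::kFilterBilinear);
  return std::memcmp(dst_c, dst_opt, static_cast<size_t>(dw) * dh) == 0;
}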
diff --git a/unit_test/scale_rgb_test.cc b/unit_test/scale_rgb_test.cc
new file mode 100644
index 00000000..8296abe3
--- /dev/null
+++ b/unit_test/scale_rgb_test.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_rgb.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int RGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ const int b = 0;  // Set to 128 to test for padding/stride.
+ int64_t src_rgb_plane_size =
+ (Abs(src_width) + b * 3) * (Abs(src_height) + b * 3) * 3LL;
+ int src_stride_rgb = (b * 3 + Abs(src_width)) * 3;
+
+ align_buffer_page_end(src_rgb, src_rgb_plane_size);
+ if (!src_rgb) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_rgb, src_rgb_plane_size);
+
+ int64_t dst_rgb_plane_size = (dst_width + b * 3) * (dst_height + b * 3) * 3LL;
+ int dst_stride_rgb = (b * 3 + dst_width) * 3;
+
+ align_buffer_page_end(dst_rgb_c, dst_rgb_plane_size);
+ align_buffer_page_end(dst_rgb_opt, dst_rgb_plane_size);
+ if (!dst_rgb_c || !dst_rgb_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ memset(dst_rgb_c, 2, dst_rgb_plane_size);
+ memset(dst_rgb_opt, 3, dst_rgb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_c + (dst_stride_rgb * b) + b * 3, dst_stride_rgb,
+ dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_opt + (dst_stride_rgb * b) + b * 3,
+ dst_stride_rgb, dst_width, dst_height, f);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_c + (dst_stride_rgb * b) + b * 3, dst_stride_rgb,
+ dst_width, dst_height, f);
+
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_opt + (dst_stride_rgb * b) + b * 3,
+ dst_stride_rgb, dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may be a little off from the optimized path. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and check that the max difference is not over 3.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 3; j < (dst_width + b) * 3; ++j) {
+ int abs_diff = Abs(dst_rgb_c[(i * dst_stride_rgb) + j] -
+ dst_rgb_opt[(i * dst_stride_rgb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_rgb_c);
+ free_aligned_buffer_page_end(dst_rgb_opt);
+ free_aligned_buffer_page_end(src_rgb);
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, RGBScaleDownBy##name##_##filter) { \
+ int diff = RGBTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
+// filtered results may differ between the SSSE3, Neon and C fixed point
+// implementations.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, 3)
+#else
+// Test a scale factor with Bilinear.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3)
+#endif
+
+TEST_FACTOR(2, 1, 2)
+#ifndef DISABLE_SLOW_TESTS
+TEST_FACTOR(4, 1, 4)
+// TEST_FACTOR(8, 1, 8) Disabled to keep benchmark time reasonable.
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#endif
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = RGBTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = RGBTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale to a specified size with all 3 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#endif
+
+TEST_SCALETO(RGBScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(RGBScale, 1, 1)
+TEST_SCALETO(RGBScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(RGBScale, 320, 240)
+TEST_SCALETO(RGBScale, 569, 480)
+TEST_SCALETO(RGBScale, 1280, 720)
+TEST_SCALETO(RGBScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = RGBTestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(RGBScale, None, 0)
+TEST_SCALESWAPXY1(RGBScale, Linear, 0)
+TEST_SCALESWAPXY1(RGBScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(RGBScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
+TEST_F(LibYUVScaleTest, RGBTest3x) {
+ const int kSrcStride = 480 * 3;
+ const int kDstStride = 160 * 3;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 3 + 0] = i;
+ orig_pixels[i * 3 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, RGBTest4x) {
+ const int kSrcStride = 640 * 3;
+ const int kDstStride = 160 * 3;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 3 + 0] = i;
+ orig_pixels[i * 3 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ RGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+ EXPECT_EQ(190, dest_pixels[1]);
+
+ RGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+} // namespace libyuv
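RGBScale, exercised above, is the packed 24-bit RGB entry point declared in
libyuv/scale_rgb.h. A minimal usage sketch (not part of the patch; the sizes
and helper name are hypothetical; strides are in bytes, 3 per pixel):

#include <cstdint>
#include <vector>

#include "libyuv/scale.h"  // FilterMode
#include "libyuv/scale_rgb.h"

// Halve a tightly packed 640x360 RGB24 image with bilinear filtering.
std::vector<uint8_t> HalveRGB24(const std::vector<uint8_t>& src) {
  const int sw = 640, sh = 360, dw = 320, dh = 180;
  std::vector<uint8_t> dst(static_cast<size_t>(dw) * dh * 3);
  libyuv::RGBScale(src.data(), sw * 3, sw, sh, dst.data(), dw * 3, dw, dh,
                   libyuv::kFilterBilinear);
  return dst;
}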
diff --git a/files/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 811b2d04..6e3b9271 100644
--- a/files/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -14,11 +14,25 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#endif
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
@@ -139,6 +153,123 @@ static int I420TestFilter(int src_width,
return max_diff;
}
+// Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I420TestFilter_12(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_12, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_12, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_12, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_12 = reinterpret_cast<uint16_t*>(src_y_12);
+ uint16_t* p_src_u_12 = reinterpret_cast<uint16_t*>(src_u_12);
+ uint16_t* p_src_v_12 = reinterpret_cast<uint16_t*>(src_v_12);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_12[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_12[i] = src_u[i];
+ p_src_v_12[i] = src_v[i];
+ }
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_12, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_12 = reinterpret_cast<uint16_t*>(dst_y_12);
+ uint16_t* p_dst_u_12 = reinterpret_cast<uint16_t*>(dst_u_12);
+ uint16_t* p_dst_v_12 = reinterpret_cast<uint16_t*>(dst_v_12);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv,
+ p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12,
+ dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_12);
+ free_aligned_buffer_page_end(dst_u_12);
+ free_aligned_buffer_page_end(dst_v_12);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_12);
+ free_aligned_buffer_page_end(src_u_12);
+ free_aligned_buffer_page_end(src_v_12);
+
+ return max_diff;
+}
+
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int I420TestFilter_16(int src_width,
@@ -374,6 +505,123 @@ static int I444TestFilter(int src_width,
return max_diff;
}
+// Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I444TestFilter_12(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_12, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_12, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_12, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_12 = reinterpret_cast<uint16_t*>(src_y_12);
+ uint16_t* p_src_u_12 = reinterpret_cast<uint16_t*>(src_u_12);
+ uint16_t* p_src_v_12 = reinterpret_cast<uint16_t*>(src_v_12);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_12[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_12[i] = src_u[i];
+ p_src_v_12[i] = src_v[i];
+ }
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_12, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_12 = reinterpret_cast<uint16_t*>(dst_y_12);
+ uint16_t* p_dst_u_12 = reinterpret_cast<uint16_t*>(dst_u_12);
+ uint16_t* p_dst_v_12 = reinterpret_cast<uint16_t*>(dst_v_12);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv,
+ p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12,
+ dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_12);
+ free_aligned_buffer_page_end(dst_u_12);
+ free_aligned_buffer_page_end(dst_v_12);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_12);
+ free_aligned_buffer_page_end(src_u_12);
+ free_aligned_buffer_page_end(src_v_12);
+
+ return max_diff;
+}
+
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int I444TestFilter_16(int src_width,
@@ -491,57 +739,185 @@ static int I444TestFilter_16(int src_width,
return max_diff;
}
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int NV12TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv * 2;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ if (!src_y || !src_uv) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_uv, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv * 2;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width,
+ dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference is not
+ // over 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv * 2); ++j) {
+ int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
+ dst_uv_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+
+ return max_diff;
+}
+
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// 2 is chroma subsample.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
- int diff = I420TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
- int diff = I444TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \
- int diff = I420TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \
- int diff = I444TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
+ int diff = I420TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
+ int diff = I444TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_12) { \
+ int diff = I420TestFilter_12( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_12) { \
+ int diff = I444TestFilter_12( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \
+ int diff = NV12TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#ifndef DISABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#else
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#endif
+#endif
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
+#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8, 0)
+#endif
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -550,7 +926,7 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
@@ -563,18 +939,40 @@ TEST_FACTOR(3, 1, 3, 0)
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_12) { \
+ int diff = I420TestFilter_12( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_12) { \
+ int diff = I444TestFilter_12( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
int diff = I420TestFilter_16( \
benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
int diff = I444TestFilter_16( \
benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
+ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
@@ -590,7 +988,23 @@ TEST_FACTOR(3, 1, 3, 0)
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- I420##name##From##width##x##height##_##filter##_16) { \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_12) { \
+ int diff = I420TestFilter_12(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_12) { \
+ int diff = I444TestFilter_12(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \
int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
@@ -598,307 +1012,122 @@ TEST_FACTOR(3, 1, 3, 0)
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- I444##name##From##width##x##height##_##filter##_16) { \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \
int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
+#ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3) \
- TEST_SCALETO1(name, width, height, Box, 3)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(, name, width, height, Box, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#endif
+#endif
TEST_SCALETO(Scale, 1, 1)
-TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
- SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
- SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
- SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
- memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
-
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- if (!has_ssse3) {
- printf("Warning SSSE3 not detected; Skipping test.\n");
- } else {
- // TL.
- orig_pixels[0] = 255u;
- orig_pixels[1] = 0u;
- orig_pixels[128 + 0] = 0u;
- orig_pixels[128 + 1] = 0u;
- // TR.
- orig_pixels[2] = 0u;
- orig_pixels[3] = 100u;
- orig_pixels[128 + 2] = 0u;
- orig_pixels[128 + 3] = 0u;
- // BL.
- orig_pixels[4] = 0u;
- orig_pixels[5] = 0u;
- orig_pixels[128 + 4] = 50u;
- orig_pixels[128 + 5] = 0u;
- // BR.
- orig_pixels[6] = 0u;
- orig_pixels[7] = 0u;
- orig_pixels[128 + 6] = 0u;
- orig_pixels[128 + 7] = 20u;
- // Odd.
- orig_pixels[126] = 4u;
- orig_pixels[127] = 255u;
- orig_pixels[128 + 126] = 16u;
- orig_pixels[128 + 127] = 255u;
-
- // Test regular half size.
- ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(133u, dst_pixels_c[63]);
-
- // Test Odd width version - Last pixel is just 1 horizontal pixel.
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(10u, dst_pixels_c[63]);
-
- // Test one pixel less, should skip the last pixel.
- memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(0u, dst_pixels_c[63]);
-
- // Test regular half size SSSE3.
- ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
-
- EXPECT_EQ(64u, dst_pixels_opt[0]);
- EXPECT_EQ(25u, dst_pixels_opt[1]);
- EXPECT_EQ(13u, dst_pixels_opt[2]);
- EXPECT_EQ(5u, dst_pixels_opt[3]);
- EXPECT_EQ(0u, dst_pixels_opt[4]);
- EXPECT_EQ(133u, dst_pixels_opt[63]);
-
- // Compare C and SSSE3 match.
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
- ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
- for (int i = 0; i < 64; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
- }
- }
-}
-#endif // HAS_SCALEROWDOWN2_SSSE3
-
-extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-
-TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun.
- SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
-
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
- memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
-
- for (int i = 0; i < 640 * 2 + 1; ++i) {
- orig_pixels[i] = i;
- }
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
- int has_neon = TestCpuFlag(kCpuHasNEON);
- if (has_neon) {
- ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- }
-#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- if (has_mmi) {
- ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- }
-#else
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
-#endif
- }
-
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+#define TEST_SCALESWAPXY1(DISABLED_, name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, I420##name##SwapXY_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_12) { \
+ int diff = I420TestFilter_12(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_12) { \
+ int diff = I444TestFilter_12(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \
+ int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \
+ int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
- EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
- EXPECT_EQ(dst_pixels_c[1279], 800);
-}
-extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-
-TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
- SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
-
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
- memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
-
- for (int i = 0; i < 2560 * 2; ++i) {
- orig_pixels[i] = i;
- }
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
- int has_neon = TestCpuFlag(kCpuHasNEON);
- if (has_neon) {
- ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
- }
+// Test scale with swapped width and height with all 4 filters.
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALESWAPXY1(, Scale, None, 0)
+TEST_SCALESWAPXY1(, Scale, Linear, 3)
+TEST_SCALESWAPXY1(, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(, Scale, Box, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+TEST_SCALESWAPXY1(DISABLED_, Scale, None, 0)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Linear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3)
#else
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3)
#endif
- }
-
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
- }
-
- EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
- EXPECT_EQ(dst_pixels_c[1279], 3839);
-}
-
-// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel
-// difference.
-// 0 = exact.
-static int TestPlaneFilter_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
- return 0;
- }
-
- int i;
- int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
- int src_stride_y = Abs(src_width);
- int dst_y_plane_size = dst_width * dst_height;
- int dst_stride_y = dst_width;
-
- align_buffer_page_end(src_y, src_y_plane_size);
- align_buffer_page_end(src_y_16, src_y_plane_size * 2);
- align_buffer_page_end(dst_y_8, dst_y_plane_size);
- align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
- uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
- uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
-
- MemRandomize(src_y, src_y_plane_size);
- memset(dst_y_8, 0, dst_y_plane_size);
- memset(dst_y_16, 1, dst_y_plane_size * 2);
-
- for (i = 0; i < src_y_plane_size; ++i) {
- p_src_y_16[i] = src_y[i] & 255;
- }
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y,
- dst_width, dst_height, f);
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
-
- for (i = 0; i < benchmark_iterations; ++i) {
- ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16,
- dst_stride_y, dst_width, dst_height, f);
- }
-
- // Expect an exact match.
- int max_diff = 0;
- for (i = 0; i < dst_y_plane_size; ++i) {
- int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
- if (abs_diff > max_diff) {
- max_diff = abs_diff;
- }
- }
-
- free_aligned_buffer_page_end(dst_y_8);
- free_aligned_buffer_page_end(dst_y_16);
- free_aligned_buffer_page_end(src_y);
- free_aligned_buffer_page_end(src_y_16);
-
- return max_diff;
-}
-
-// The following adjustments in dimensions ensure the scale factor will be
-// exactly achieved.
-// 2 is chroma subsample.
-#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
-#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \
- int diff = TestPlaneFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- }
-
-// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
-// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \
- TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#endif
+#undef TEST_SCALESWAPXY1
-TEST_FACTOR(2, 1, 2, 0)
-TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
-TEST_FACTOR(3by4, 3, 4, 1)
-TEST_FACTOR(3by8, 3, 8, 1)
-TEST_FACTOR(3, 1, 3, 0)
-#undef TEST_FACTOR1
-#undef TEST_FACTOR
-#undef SX
-#undef DX
} // namespace libyuv
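The DISABLED_ macro parameter added above relies on a GoogleTest naming
convention: tests whose names begin with "DISABLED_" are compiled but skipped
unless --gtest_also_run_disabled_tests is passed. A minimal sketch of the same
prefix-pasting trick (not part of the patch; macro and test names are
hypothetical):

#include <gtest/gtest.h>

#define SCALE_SKETCH_TEST(prefix, name) TEST(ScaleSketch, prefix##name)

SCALE_SKETCH_TEST(, RunsByDefault) { EXPECT_TRUE(true); }
SCALE_SKETCH_TEST(DISABLED_, SkippedByDefault) { EXPECT_TRUE(true); }
// Run skipped tests with: ./libyuv_unittest --gtest_also_run_disabled_tests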
diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc
new file mode 100644
index 00000000..dab217c9
--- /dev/null
+++ b/unit_test/scale_uv_test.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_uv.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int UVTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL;
+ int src_stride_uv = Abs(src_width) * 2;
+ int64_t dst_uv_plane_size = dst_width * dst_height * 2LL;
+ int dst_stride_uv = dst_width * 2;
+
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+
+ if (!src_uv || !dst_uv_c || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_uv, src_uv_plane_size);
+ memset(dst_uv_c, 2, dst_uv_plane_size);
+ memset(dst_uv_opt, 123, dst_uv_plane_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_c, dst_stride_uv,
+ dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_opt,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ int max_diff = 0;
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_uv);
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom) \
+ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
+ int diff = UVTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_EQ(0, diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test a scale factor with all 4 filters. Expect exact for SIMD vs C.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom) \
+ TEST_FACTOR1(name, Linear, nom, denom) \
+ TEST_FACTOR1(name, Bilinear, nom, denom) \
+ TEST_FACTOR1(name, Box, nom, denom)
+#else
+// Test a scale factor with Bilinear.
+#define TEST_FACTOR(name, nom, denom) TEST_FACTOR1(name, Bilinear, nom, denom)
+#endif
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+// TEST_FACTOR(8, 1, 8) Disabled to keep benchmark time reasonable.
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale to a specified size with all 3 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#endif
+
+TEST_SCALETO(UVScale, 1, 1)
+TEST_SCALETO(UVScale, 569, 480)
+TEST_SCALETO(UVScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(UVScale, 320, 240)
+TEST_SCALETO(UVScale, 1280, 720)
+TEST_SCALETO(UVScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = \
+ UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_, \
+ benchmark_width_, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(UVScale, None, 0)
+TEST_SCALESWAPXY1(UVScale, Linear, 0)
+TEST_SCALESWAPXY1(UVScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(UVScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
+TEST_F(LibYUVScaleTest, UVTest3x) {
+ const int kSrcStride = 480 * 2;
+ const int kDstStride = 160 * 2;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 2 + 0] = i;
+ orig_pixels[i * 2 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, UVTest4x) {
+ const int kSrcStride = 640 * 2;
+ const int kDstStride = 160 * 2;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 2 + 0] = i;
+ orig_pixels[i * 2 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ UVScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+ EXPECT_EQ(190, dest_pixels[1]);
+
+ UVScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+} // namespace libyuv
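UVScale, used throughout this file, scales an interleaved UV (NV12/NV21
chroma) plane and is declared in libyuv/scale_uv.h. A minimal usage sketch
(not part of the patch; sizes and helper name are hypothetical; width and
height count UV pairs, so strides are 2 bytes per pair):

#include <cstdint>
#include <vector>

#include "libyuv/scale.h"  // FilterMode
#include "libyuv/scale_uv.h"

// Halve a tightly packed 320x180 interleaved UV plane.
std::vector<uint8_t> HalveUV(const std::vector<uint8_t>& src_uv) {
  const int sw = 320, sh = 180, dw = 160, dh = 90;
  std::vector<uint8_t> dst(static_cast<size_t>(dw) * dh * 2);
  libyuv::UVScale(src_uv.data(), sw * 2, sw, sh, dst.data(), dw * 2, dw, dh,
                  libyuv::kFilterBilinear);
  return dst;
}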
diff --git a/files/unit_test/testdata/arm_v7.txt b/unit_test/testdata/arm_v7.txt
index 5d7dbd04..5d7dbd04 100644
--- a/files/unit_test/testdata/arm_v7.txt
+++ b/unit_test/testdata/arm_v7.txt
diff --git a/files/unit_test/testdata/juno.txt b/unit_test/testdata/juno.txt
index dd465272..dd465272 100644
--- a/files/unit_test/testdata/juno.txt
+++ b/unit_test/testdata/juno.txt
diff --git a/unit_test/testdata/mips.txt b/unit_test/testdata/mips.txt
new file mode 100644
index 00000000..d9f28cbf
--- /dev/null
+++ b/unit_test/testdata/mips.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_loongson2k.txt b/unit_test/testdata/mips_loongson2k.txt
new file mode 100644
index 00000000..8a88d38f
--- /dev/null
+++ b/unit_test/testdata/mips_loongson2k.txt
@@ -0,0 +1,5 @@
+system type : Loongson2K-SBC
+machine : loongson,LS2k1000-EVP
+processor : 0
+cpu model : Loongson-2K V0.3 FPU V0.1
+BogoMIPS : 1980.41
diff --git a/unit_test/testdata/mips_loongson3.txt b/unit_test/testdata/mips_loongson3.txt
new file mode 100644
index 00000000..1f540b12
--- /dev/null
+++ b/unit_test/testdata/mips_loongson3.txt
@@ -0,0 +1,10 @@
+system type : generic-loongson-machine
+machine : Unknown
+processor : 0
+cpu model : ICT Loongson-3 V0.9 FPU V0.1
+model name : ICT Loongson-3A R3 (Loongson-3A3000) @ 1500MHz
+BogoMIPS : 2990.15
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : dsp dsp2 vz
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_loongson_mmi.txt b/unit_test/testdata/mips_loongson_mmi.txt
new file mode 100644
index 00000000..0f10b8bb
--- /dev/null
+++ b/unit_test/testdata/mips_loongson_mmi.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz loongson-mmi loongson-ext
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_msa.txt b/unit_test/testdata/mips_msa.txt
new file mode 100644
index 00000000..ac930615
--- /dev/null
+++ b/unit_test/testdata/mips_msa.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz msa
+shadow register sets : 1
diff --git a/unit_test/testdata/riscv64.txt b/unit_test/testdata/riscv64.txt
new file mode 100644
index 00000000..fbb4200f
--- /dev/null
+++ b/unit_test/testdata/riscv64.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imac
+mmu : sv48
\ No newline at end of file
diff --git a/unit_test/testdata/riscv64_rvv.txt b/unit_test/testdata/riscv64_rvv.txt
new file mode 100644
index 00000000..af1b3f36
--- /dev/null
+++ b/unit_test/testdata/riscv64_rvv.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imafdcv
+mmu : sv48
\ No newline at end of file
diff --git a/unit_test/testdata/riscv64_rvv_zvfh.txt b/unit_test/testdata/riscv64_rvv_zvfh.txt
new file mode 100644
index 00000000..c416c1af
--- /dev/null
+++ b/unit_test/testdata/riscv64_rvv_zvfh.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imafdcv_zfh_zvfh
+mmu : sv48
\ No newline at end of file
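
These testdata files are canned /proc/cpuinfo captures for the CPU-detection tests: MIPS feature detection keys off the "ASEs implemented" line (msa, loongson-mmi, loongson-ext) and RISC-V detection off the "isa" string (the trailing v / _zvfh extensions). A minimal sketch of the parsing idea, not libyuv's internal helper (which lives in cpu_id.cc):

    #include <stdio.h>
    #include <string.h>

    // Return 1 if the capture's "ASEs implemented" line mentions `feature`.
    // Simplified: plain substring match, so "msa" would also match "xmsa".
    static int CpuInfoHasAse(const char* path, const char* feature) {
      char line[512];
      int found = 0;
      FILE* f = fopen(path, "r");
      if (!f) return 0;
      while (fgets(line, sizeof(line), f)) {
        if (!strncmp(line, "ASEs implemented", 16) && strstr(line, feature)) {
          found = 1;
          break;
        }
      }
      fclose(f);
      return found;
    }
    // CpuInfoHasAse("unit_test/testdata/mips_msa.txt", "msa") -> 1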
diff --git a/files/unit_test/testdata/tegra3.txt b/unit_test/testdata/tegra3.txt
index d1b09f6b..d1b09f6b 100644
--- a/files/unit_test/testdata/tegra3.txt
+++ b/unit_test/testdata/tegra3.txt
diff --git a/files/unit_test/testdata/test0.jpg b/unit_test/testdata/test0.jpg
index f4461a81..f4461a81 100644
--- a/files/unit_test/testdata/test0.jpg
+++ b/unit_test/testdata/test0.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test1.jpg b/unit_test/testdata/test1.jpg
index a0210e9d..a0210e9d 100644
--- a/files/unit_test/testdata/test1.jpg
+++ b/unit_test/testdata/test1.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test2.jpg b/unit_test/testdata/test2.jpg
index 816ca767..816ca767 100644
--- a/files/unit_test/testdata/test2.jpg
+++ b/unit_test/testdata/test2.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test3.jpg b/unit_test/testdata/test3.jpg
index 792d91dc..792d91dc 100644
--- a/files/unit_test/testdata/test3.jpg
+++ b/unit_test/testdata/test3.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test4.jpg b/unit_test/testdata/test4.jpg
index 1ef41668..1ef41668 100644
--- a/files/unit_test/testdata/test4.jpg
+++ b/unit_test/testdata/test4.jpg
Binary files differ
diff --git a/files/unit_test/unit_test.cc b/unit_test/unit_test.cc
index a1ae7ea3..239d5b92 100644
--- a/files/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -14,23 +14,28 @@
#include <cstring>
-#ifdef LIBYUV_USE_GFLAGS
-#include "gflags/gflags.h"
+#ifdef LIBYUV_USE_ABSL_FLAGS
+#include "absl/flags/flag.h"
+#include "absl/flags/parse.h"
#endif
#include "libyuv/cpu_id.h"
unsigned int fastrand_seed = 0xfb;
-#ifdef LIBYUV_USE_GFLAGS
-DEFINE_int32(libyuv_width, 0, "width of test image.");
-DEFINE_int32(libyuv_height, 0, "height of test image.");
-DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
-DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD");
-DEFINE_int32(libyuv_cpu_info,
- 0,
- "cpu flags for benchmark code. 1 = C, -1 = SIMD");
+#ifdef LIBYUV_USE_ABSL_FLAGS
+ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image.");
+ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image.");
+ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test.");
+ABSL_FLAG(int32_t,
+ libyuv_flags,
+ 0,
+ "cpu flags for reference code. 1 = C, -1 = SIMD");
+ABSL_FLAG(int32_t,
+ libyuv_cpu_info,
+ 0,
+ "cpu flags for benchmark code. 1 = C, -1 = SIMD");
#else
-// Disable command line parameters if gflags disabled.
+// Disable command line parameters if absl/flags is disabled.
static const int32_t FLAGS_libyuv_width = 0;
static const int32_t FLAGS_libyuv_height = 0;
static const int32_t FLAGS_libyuv_repeat = 0;
@@ -38,6 +43,12 @@ static const int32_t FLAGS_libyuv_flags = 0;
static const int32_t FLAGS_libyuv_cpu_info = 0;
#endif
+#ifdef LIBYUV_USE_ABSL_FLAGS
+#define LIBYUV_GET_FLAG(f) absl::GetFlag(f)
+#else
+#define LIBYUV_GET_FLAG(f) f
+#endif
+
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
#if !defined(__native_client__) && !defined(_M_ARM)
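
The LIBYUV_GET_FLAG shim lets the many call sites below stay identical in both configurations: with LIBYUV_USE_ABSL_FLAGS, ABSL_FLAG(int32_t, name, ...) defines an absl::Flag<int32_t> object named FLAGS_name that must be read via absl::GetFlag(); without it, FLAGS_name is a plain const int32_t read directly. The pattern in isolation, with the includes shown above and a hypothetical flag name:

    #ifdef LIBYUV_USE_ABSL_FLAGS
    ABSL_FLAG(int32_t, example, 0, "an example flag");
    #define GET_FLAG(f) absl::GetFlag(f)
    #else
    static const int32_t FLAGS_example = 0;
    #define GET_FLAG(f) f
    #endif
    // Either way a call site reads: int32_t v = GET_FLAG(FLAGS_example);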
@@ -66,8 +77,20 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_MSA")) {
cpu_info &= ~libyuv::kCpuHasMSA;
}
- if (TestEnv("LIBYUV_DISABLE_MMI")) {
- cpu_info &= ~libyuv::kCpuHasMMI;
+#endif
+#if defined(__longarch__) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_LSX")) {
+ cpu_info &= ~libyuv::kCpuHasLSX;
+ }
+#endif
+#if defined(__longarch__) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_LASX")) {
+ cpu_info &= ~libyuv::kCpuHasLASX;
+ }
+#endif
+#if defined(__riscv) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_RVV")) {
+ cpu_info &= ~libyuv::kCpuHasRVV;
}
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -109,6 +132,9 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_AVX512VL")) {
cpu_info &= ~libyuv::kCpuHasAVX512VL;
}
+ if (TestEnv("LIBYUV_DISABLE_AVX512VNNI")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VNNI;
+ }
if (TestEnv("LIBYUV_DISABLE_AVX512VBMI")) {
cpu_info &= ~libyuv::kCpuHasAVX512VBMI;
}
@@ -118,11 +144,14 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_AVX512VBITALG")) {
cpu_info &= ~libyuv::kCpuHasAVX512VBITALG;
}
- if (TestEnv("LIBYUV_DISABLE_AVX512VPOPCNTDQ")) {
- cpu_info &= ~libyuv::kCpuHasAVX512VPOPCNTDQ;
+ if (TestEnv("LIBYUV_DISABLE_AVX10")) {
+ cpu_info &= ~libyuv::kCpuHasAVX10;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVXVNNI")) {
+ cpu_info &= ~libyuv::kCpuHasAVXVNNI;
}
- if (TestEnv("LIBYUV_DISABLE_GFNI")) {
- cpu_info &= ~libyuv::kCpuHasGFNI;
+ if (TestEnv("LIBYUV_DISABLE_AVXVNNIINT8")) {
+ cpu_info &= ~libyuv::kCpuHasAVXVNNIINT8;
}
#endif
if (TestEnv("LIBYUV_DISABLE_ASM")) {
@@ -145,8 +174,8 @@ LibYUVConvertTest::LibYUVConvertTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -156,29 +185,29 @@ LibYUVConvertTest::LibYUVConvertTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -201,8 +230,8 @@ LibYUVColorTest::LibYUVColorTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -212,29 +241,29 @@ LibYUVColorTest::LibYUVColorTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -257,8 +286,8 @@ LibYUVScaleTest::LibYUVScaleTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -268,29 +297,29 @@ LibYUVScaleTest::LibYUVScaleTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -313,8 +342,8 @@ LibYUVRotateTest::LibYUVRotateTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -324,29 +353,29 @@ LibYUVRotateTest::LibYUVRotateTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -369,8 +398,8 @@ LibYUVPlanarTest::LibYUVPlanarTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -380,29 +409,29 @@ LibYUVPlanarTest::LibYUVPlanarTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -425,8 +454,8 @@ LibYUVBaseTest::LibYUVBaseTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -436,29 +465,29 @@ LibYUVBaseTest::LibYUVBaseTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -481,8 +510,8 @@ LibYUVCompareTest::LibYUVCompareTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -492,29 +521,29 @@ LibYUVCompareTest::LibYUVCompareTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -529,11 +558,8 @@ LibYUVCompareTest::LibYUVCompareTest()
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
-#ifdef LIBYUV_USE_GFLAGS
- // AllowCommandLineParsing allows us to ignore flags passed on to us by
- // Chromium build bots without having to explicitly disable them.
- google::AllowCommandLineReparsing();
- google::ParseCommandLineFlags(&argc, &argv, true);
+#ifdef LIBYUV_USE_ABSL_FLAGS
+ absl::ParseCommandLine(argc, argv);
#endif
return RUN_ALL_TESTS();
}
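
With this wiring, benchmark dimensions can be set either through environment variables (LIBYUV_WIDTH, LIBYUV_HEIGHT, LIBYUV_REPEAT, LIBYUV_FLAGS, LIBYUV_CPU_INFO) or, when built with LIBYUV_USE_ABSL_FLAGS, through the corresponding flags, e.g. --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10 (the binary name varies per build); flags take precedence because each constructor applies them after its getenv() checks.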
diff --git a/files/unit_test/unit_test.h b/unit_test/unit_test.h
index 87907fa1..99cc8d19 100644
--- a/files/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -11,10 +11,10 @@
#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
#define UNIT_TEST_UNIT_TEST_H_
-#ifdef WIN32
+#include <stddef.h> // For NULL
+#ifdef _WIN32
#include <windows.h>
#else
-#include <sys/resource.h>
#include <sys/time.h>
#endif
@@ -77,7 +77,18 @@ static inline bool SizeValid(int src_width,
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
- var = 0
+ var = NULL
+
+#define align_buffer_page_end_16(var, size) \
+ uint8_t* var##_mem = \
+ reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095)); \
+ uint16_t* var = reinterpret_cast<uint16_t*>( \
+ (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \
+ ~63)
+
+#define free_aligned_buffer_page_end_16(var) \
+ free(var##_mem); \
+ var = NULL
#ifdef WIN32
static inline double get_time() {
@@ -111,10 +122,13 @@ inline int fastrand() {
return static_cast<int>((fastrand_seed >> 16) & 0xffff);
}
+// ubsan reports unaligned uint16_t stores, so write the random bytes one
+// uint8_t at a time.
static inline void MemRandomize(uint8_t* dst, int64_t len) {
int64_t i;
for (i = 0; i < len - 1; i += 2) {
- *reinterpret_cast<uint16_t*>(dst) = fastrand();
+ int r = fastrand();
+ dst[0] = static_cast<uint8_t>(r);
+ dst[1] = static_cast<uint8_t>(r >> 8);
dst += 2;
}
for (; i < len; ++i) {
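
The align_buffer_page_end_16 macro above applies the suite's page-end trick to uint16_t buffers: the allocation is rounded up to a 4096-byte multiple and the usable region is placed flush against the allocation's end (then aligned down to 64 bytes, which the "+ 63" headroom allows), so overruns past the buffer immediately run off the allocation and are caught by memory tools. The same arithmetic for the 8-bit case, as a sketch:

    #include <stdint.h>
    #include <stdlib.h>

    // Returns a `size`-byte buffer ending at (or within 63 bytes of) the end
    // of a page-multiple allocation; *mem_out is the pointer to free() later.
    static uint8_t* AllocPageEnd(size_t size, uint8_t** mem_out) {
      size_t rounded = (size + 4095 + 63) & ~(size_t)4095;
      uint8_t* mem = (uint8_t*)malloc(rounded);
      *mem_out = mem;
      return (uint8_t*)((uintptr_t)(mem + rounded - size) & ~(uintptr_t)63);
    }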
diff --git a/files/unit_test/video_common_test.cc b/unit_test/video_common_test.cc
index a84206a2..36728ea9 100644
--- a/files/unit_test/video_common_test.cc
+++ b/unit_test/video_common_test.cc
@@ -29,7 +29,7 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) {
!TestValidChar((fourcc >> 24) & 0xff)) {
return false;
}
- if (bpp < 0 || bpp > 32) {
+ if (bpp < 0 || bpp > 64) {
return false;
}
return true;
@@ -65,13 +65,15 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
- EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
@@ -81,6 +83,11 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
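
The bpp ceiling in TestValidFourCC rises from 32 to 64 because the new AR64/AB64 FourCCs carry 16 bits per channel, i.e. 64 bits per ARGB pixel.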
diff --git a/files/util/Makefile b/util/Makefile
index 40e74b65..40e74b65 100644
--- a/files/util/Makefile
+++ b/util/Makefile
diff --git a/util/color.cc b/util/color.cc
new file mode 100644
index 00000000..8c3bbefd
--- /dev/null
+++ b/util/color.cc
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// For those MCs that can be represented as kr and kb:
+// Full range
+// float M[3][3]
+// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3]
+// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// Limited range
+// float M[3][3]
+// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3]
+// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+
+// mc bt
+// 1 bt.709 KR = 0.2126; KB = 0.0722
+// 4 fcc KR = 0.30; KB = 0.11
+// 6 bt.601 KR = 0.299; KB = 0.114
+// 7 SMPTE 240M KR = 0.212; KB = 0.087
+// 10 bt2020 KR = 0.2627; KB = 0.0593
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int round(float v) {
+ return (int)(v + 0.5);
+}
+
+int main(int argc, const char* argv[]) {
+ if (argc < 2) {
+ printf("color kr kb\n");
+ return -1;
+ }
+ float kr = atof(argv[1]);
+ float kb = atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("Limited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ return 0;
+}
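
A worked check of the arithmetic (BT.601, kr = 0.299, kb = 0.114, so kg = 0.587), reproducing the UB/UG/VG/VR values quoted in the header comment:

    // vr = 2*(1 - 0.299)             = 1.40200 -> round(64 * vr) = 90
    // ub = 2*(1 - 0.114)             = 1.77200 -> round(64 * ub) = 113
    // ug = 2*(1 - 0.114)*0.114/0.587 = 0.34414 -> round(64 * ug) = 22
    // vg = 2*(1 - 0.299)*0.299/0.587 = 0.71414 -> round(64 * vg) = 46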
diff --git a/files/util/compare.cc b/util/compare.cc
index a16613ee..a16613ee 100644
--- a/files/util/compare.cc
+++ b/util/compare.cc
diff --git a/files/util/cpuid.c b/util/cpuid.c
index 84c06022..c07e6e95 100644
--- a/files/util/cpuid.c
+++ b/util/cpuid.c
@@ -12,16 +12,19 @@
#include <stdlib.h>
#include <string.h>
-#define INCLUDE_LIBYUV_COMPARE_H_
-#include "libyuv.h"
-#include "./psnr.h"
-#include "./ssim.h"
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
int main(int argc, const char* argv[]) {
int cpu_flags = TestCpuFlag(-1);
int has_arm = TestCpuFlag(kCpuHasARM);
- int has_mips = TestCpuFlag(kCpuHasMIPS);
+ int has_riscv = TestCpuFlag(kCpuHasRISCV);
int has_x86 = TestCpuFlag(kCpuHasX86);
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
(void)argc;
(void)argv;
@@ -60,19 +63,28 @@ int main(int argc, const char* argv[]) {
model, model);
}
#endif
- printf("Cpu Flags %x\n", cpu_flags);
- printf("Has ARM %x\n", has_arm);
- printf("Has MIPS %x\n", has_mips);
- printf("Has X86 %x\n", has_x86);
+ printf("Cpu Flags 0x%x\n", cpu_flags);
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
- printf("Has NEON %x\n", has_neon);
+ printf("Has ARM 0x%x\n", has_arm);
+ printf("Has NEON 0x%x\n", has_neon);
+ }
+ if (has_riscv) {
+ int has_rvv = TestCpuFlag(kCpuHasRVV);
+ printf("Has RISCV 0x%x\n", has_riscv);
+ printf("Has RVV 0x%x\n", has_rvv);
}
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
- printf("Has MSA %x\n", has_msa);
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- printf("Has MMI %x\n", has_mmi);
+ printf("Has MIPS 0x%x\n", has_mips);
+ printf("Has MSA 0x%x\n", has_msa);
+ }
+ if (has_loongarch) {
+ int has_lsx = TestCpuFlag(kCpuHasLSX);
+ int has_lasx = TestCpuFlag(kCpuHasLASX);
+ printf("Has LOONGARCH 0x%x\n", has_loongarch);
+ printf("Has LSX 0x%x\n", has_lsx);
+ printf("Has LASX 0x%x\n", has_lasx);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -83,30 +95,35 @@ int main(int argc, const char* argv[]) {
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
- int has_gfni = TestCpuFlag(kCpuHasGFNI);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
- int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
- printf("Has SSE2 %x\n", has_sse2);
- printf("Has SSSE3 %x\n", has_ssse3);
- printf("Has SSE4.1 %x\n", has_sse41);
- printf("Has SSE4.2 %x\n", has_sse42);
- printf("Has AVX %x\n", has_avx);
- printf("Has AVX2 %x\n", has_avx2);
- printf("Has ERMS %x\n", has_erms);
- printf("Has FMA3 %x\n", has_fma3);
- printf("Has F16C %x\n", has_f16c);
- printf("Has GFNI %x\n", has_gfni);
- printf("Has AVX512BW %x\n", has_avx512bw);
- printf("Has AVX512VL %x\n", has_avx512vl);
- printf("Has AVX512VBMI %x\n", has_avx512vbmi);
- printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2);
- printf("Has AVX512VBITALG %x\n", has_avx512vbitalg);
- printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq);
+ int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+ int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+ int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+ printf("Has X86 0x%x\n", has_x86);
+ printf("Has SSE2 0x%x\n", has_sse2);
+ printf("Has SSSE3 0x%x\n", has_ssse3);
+ printf("Has SSE4.1 0x%x\n", has_sse41);
+ printf("Has SSE4.2 0x%x\n", has_sse42);
+ printf("Has AVX 0x%x\n", has_avx);
+ printf("Has AVX2 0x%x\n", has_avx2);
+ printf("Has ERMS 0x%x\n", has_erms);
+ printf("Has FMA3 0x%x\n", has_fma3);
+ printf("Has F16C 0x%x\n", has_f16c);
+ printf("Has AVX512BW 0x%x\n", has_avx512bw);
+ printf("Has AVX512VL 0x%x\n", has_avx512vl);
+ printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+ printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+ printf("Has AVX10 0x%x\n", has_avx10);
+ printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+ printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
}
return 0;
}
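
Beyond reporting, the same cpu_id API is what applications use for runtime dispatch, and MaskCpuFlags mirrors the "1 = C, -1 = SIMD" convention of the test flags above. A short sketch against the public header:

    #include "libyuv/cpu_id.h"

    bool HaveAVX2() {
      return libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) != 0;
    }

    void ForceCPathsOnly() {
      libyuv::MaskCpuFlags(1);  // C only; MaskCpuFlags(-1) restores all SIMD.
    }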
diff --git a/util/i444tonv12_eg.cc b/util/i444tonv12_eg.cc
new file mode 100644
index 00000000..0fcb4095
--- /dev/null
+++ b/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h> // for printf
+#include <string.h> // for memset
+
+int main(int, char**) {
+ unsigned char src_i444[640 * 400 * 3];
+ unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+ for (size_t i = 0; i < sizeof(src_i444); ++i) {
+ src_i444[i] = i & 255;
+ }
+ memset(dst_nv12, 0, sizeof(dst_nv12));
+ libyuv::I444ToNV12(&src_i444[0], 640, // source Y
+ &src_i444[640 * 400], 640, // source U
+ &src_i444[640 * 400 * 2], 640, // source V
+ &dst_nv12[0], 640, // dest Y
+ &dst_nv12[640 * 400], 640, // dest UV
+ 640, 400); // width and height
+
+ int checksum = 0;
+ for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+ checksum += dst_nv12[i];
+ }
+ printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+ return 0;
+}
\ No newline at end of file
diff --git a/files/util/psnr.cc b/util/psnr.cc
index c7bee7f9..c7bee7f9 100644
--- a/files/util/psnr.cc
+++ b/util/psnr.cc
diff --git a/files/util/psnr.h b/util/psnr.h
index aac128cb..aac128cb 100644
--- a/files/util/psnr.h
+++ b/util/psnr.h
diff --git a/files/util/psnr_main.cc b/util/psnr_main.cc
index a930b202..8b9fd972 100644
--- a/files/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -248,13 +248,13 @@ bool UpdateMetrics(uint8_t* ch_org,
int number_of_frames,
metric* cur_distortion_psnr,
metric* distorted_frame,
- bool do_psnr) {
+ bool compute_psnr) {
const int uv_offset = (do_swap_uv ? uv_size : 0);
const uint8_t* const u_org = ch_org + y_size + uv_offset;
const uint8_t* const u_rec = ch_rec + y_size;
const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset);
const uint8_t* const v_rec = ch_rec + y_size + uv_size;
- if (do_psnr) {
+ if (compute_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
diff --git a/files/util/ssim.cc b/util/ssim.cc
index 096fbcf0..096fbcf0 100644
--- a/files/util/ssim.cc
+++ b/util/ssim.cc
diff --git a/files/util/ssim.h b/util/ssim.h
index a855f1d1..a855f1d1 100644
--- a/files/util/ssim.h
+++ b/util/ssim.h
diff --git a/util/yuvconstants.c b/util/yuvconstants.c
new file mode 100644
index 00000000..4e5185af
--- /dev/null
+++ b/util/yuvconstants.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// See Also
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int main(int argc, const char* argv[]) {
+ if (argc < 3) {
+ printf("yuvconstants [KR] [KB]\n");
+ printf(" e.g. yuvconstants 0.2126 0.0722\n");
+ printf(" MC BT KR KB\n");
+ printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n");
+ printf(" 4 FCC KR = 0.30; KB = 0.11\n");
+ printf(" 6 BT.601 KR = 0.299; KB = 0.114\n");
+ printf(" 7 SMPTE 240M KR = 0.212; KB = 0.087\n");
+ printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n");
+ return -1;
+ }
+ float kr = (float)atof(argv[1]);
+ float kb = (float)atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+ printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+ printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+ printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("\nLimited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+ printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+ printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+ printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+ return 0;
+}
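
As a sanity check, running yuvconstants 0.2126 0.0722 reproduces the BT.709 full-range reference quoted in the header comment: with kg = 0.7152, vr = 2*(1 - 0.2126) = 1.5748, ub = 2*(1 - 0.0722) = 1.8556, ug = 2*(1 - 0.0722)*0.0722/0.7152 = 0.18732, and vg = 2*(1 - 0.2126)*0.2126/0.7152 = 0.46812.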
diff --git a/files/util/yuvconvert.cc b/util/yuvconvert.cc
index 27cdfe9e..93b52668 100644
--- a/files/util/yuvconvert.cc
+++ b/util/yuvconvert.cc
@@ -42,9 +42,9 @@ static __inline uint32_t Abs(int32_t v) {
}
// Parse PYUV format, e.g. name.1920x800_24Hz_P420.yuv
-bool ExtractResolutionFromFilename(const char* name,
- int* width_ptr,
- int* height_ptr) {
+static bool ExtractResolutionFromFilename(const char* name,
+ int* width_ptr,
+ int* height_ptr) {
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
@@ -59,7 +59,7 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
-void PrintHelp(const char* program) {
+static void PrintHelp(const char* program) {
printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
printf(
" -s <width> <height> .... specify source resolution. "
@@ -78,7 +78,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+static void ParseOptions(int argc, const char* argv[]) {
if (argc <= 1) {
PrintHelp(argv[0]);
}
@@ -165,23 +165,23 @@ static int TileARGBScale(const uint8_t* src_argb,
int src_height,
uint8_t* dst_argb,
int dst_stride_argb,
- int dst_width,
- int dst_height,
+ int destination_width,
+ int destination_height,
libyuv::FilterMode filtering) {
- for (int y = 0; y < dst_height; y += kTileY) {
- for (int x = 0; x < dst_width; x += kTileX) {
+ for (int y = 0; y < destination_height; y += kTileY) {
+ for (int x = 0; x < destination_width; x += kTileX) {
int clip_width = kTileX;
- if (x + clip_width > dst_width) {
- clip_width = dst_width - x;
+ if (x + clip_width > destination_width) {
+ clip_width = destination_width - x;
}
int clip_height = kTileY;
- if (y + clip_height > dst_height) {
- clip_height = dst_height - y;
+ if (y + clip_height > destination_height) {
+ clip_height = destination_height - y;
}
int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width,
src_height, dst_argb, dst_stride_argb,
- dst_width, dst_height, x, y, clip_width,
- clip_height, filtering);
+ destination_width, destination_height, x, y,
+ clip_width, clip_height, filtering);
if (r) {
return r;
}
diff --git a/files/winarm.mk b/winarm.mk
index c4307a43..b0a344ae 100644
--- a/files/winarm.mk
+++ b/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
+ source/scale_uv.o\
source/video_common.o
.cc.o: